From ebb4378237e572ce2e888136a613c7c051439f95 Mon Sep 17 00:00:00 2001 From: Jeremy Evans Date: Mon, 11 Apr 2022 08:17:19 -0700 Subject: [ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding This allows for the ability to opt-in to a method to set the encoding of response bodies. By setting the accessor to a String or Encoding instance, it will use the specified encoding. Setting the value of true will try to detect the encoding of the response body, either using the Content-Type header (assuming it specifies charset) or by scanning for a tag in the document that specifies the encoding. The default is false in which case no forcing of encoding will be done (same as before the patch). Implements [Feature #2567] Implements [Feature #15517] https://github.com/ruby/net-http/commit/6233e6b7c1 Co-authored-by: Yui Naruse --- lib/net/http.rb | 14 +++++ lib/net/http/response.rb | 159 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 173 insertions(+) (limited to 'lib/net') diff --git a/lib/net/http.rb b/lib/net/http.rb index 3fcf23b05c..5e64e38665 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -698,6 +698,7 @@ module Net #:nodoc: @continue_timeout = nil @max_retries = 1 @debug_output = nil + @response_body_encoding = false @proxy_from_env = false @proxy_uri = nil @@ -745,6 +746,18 @@ module Net #:nodoc: # The local port used to establish the connection. attr_accessor :local_port + # The encoding to use for the response body. If Encoding, uses the + # specified encoding. If other true value, tries to detect the response + # body encoding. + attr_reader :response_body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def response_body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @response_body_encoding = value + end + attr_writer :proxy_from_env attr_writer :proxy_address attr_writer :proxy_port @@ -1592,6 +1605,7 @@ module Net #:nodoc: begin res = HTTPResponse.read_new(@socket) res.decode_content = req.decode_content + res.body_encoding = @response_body_encoding end while res.kind_of?(HTTPInformation) res.uri = req.uri diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index 08eaeb2cac..ecbfd42d2b 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -84,6 +84,7 @@ class Net::HTTPResponse @read = false @uri = nil @decode_content = false + @body_encoding = false end # The HTTP version supported by the server. @@ -106,6 +107,18 @@ class Net::HTTPResponse # Accept-Encoding header from the user. attr_accessor :decode_content + # The encoding to use for the response body. If Encoding, use that encoding. + # If other true value, attempt to detect the appropriate encoding, and use + # that. + attr_reader :body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @body_encoding = value + end + def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" end @@ -214,6 +227,17 @@ class Net::HTTPResponse end @read = true + case enc = @body_encoding + when Encoding, false, nil + # Encoding: force given encoding + # false/nil: do not force encoding + else + # other value: detect encoding from body + enc = detect_encoding(@body) + end + + @body.force_encoding(enc) if enc + @body end @@ -245,6 +269,141 @@ class Net::HTTPResponse private + # :nodoc: + def detect_encoding(str, encoding=nil) + if encoding + elsif encoding = type_params['charset'] + elsif encoding = check_bom(str) + else + encoding = case content_type&.downcase + when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml} + /\A' + ss.getch + return nil + end + name = ss.scan(/[^=\t\n\f\r \/>]*/) + name.downcase! + raise if name.empty? + ss.skip(/[\t\n\f\r ]*/) + if ss.getch != '=' + value = '' + return [name, value] + end + ss.skip(/[\t\n\f\r ]*/) + case ss.peek(1) + when '"' + ss.getch + value = ss.scan(/[^"]+/) + value.downcase! + ss.getch + when "'" + ss.getch + value = ss.scan(/[^']+/) + value.downcase! + ss.getch + when '>' + value = '' + else + value = ss.scan(/[^\t\n\f\r >]+/) + value.downcase! + end + [name, value] + end + + def extracting_encodings_from_meta_elements(value) + # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element + if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value + return $1 || $2 || $3 + end + return nil + end + ## # Checks for a supported Content-Encoding header and yields an Inflate # wrapper for this response's socket when zlib is present. If the -- cgit v1.2.3