From ebb4378237e572ce2e888136a613c7c051439f95 Mon Sep 17 00:00:00 2001 From: Jeremy Evans Date: Mon, 11 Apr 2022 08:17:19 -0700 Subject: [ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding This allows for the ability to opt-in to a method to set the encoding of response bodies. By setting the accessor to a String or Encoding instance, it will use the specified encoding. Setting the value of true will try to detect the encoding of the response body, either using the Content-Type header (assuming it specifies charset) or by scanning for a tag in the document that specifies the encoding. The default is false in which case no forcing of encoding will be done (same as before the patch). Implements [Feature #2567] Implements [Feature #15517] https://github.com/ruby/net-http/commit/6233e6b7c1 Co-authored-by: Yui Naruse --- lib/net/http.rb | 14 +++ lib/net/http/response.rb | 159 +++++++++++++++++++++++++ test/net/http/test_http.rb | 54 +++++++++ test/net/http/test_httpresponse.rb | 235 +++++++++++++++++++++++++++++++++++++ 4 files changed, 462 insertions(+) diff --git a/lib/net/http.rb b/lib/net/http.rb index 3fcf23b05c..5e64e38665 100644 --- a/lib/net/http.rb +++ b/lib/net/http.rb @@ -698,6 +698,7 @@ module Net #:nodoc: @continue_timeout = nil @max_retries = 1 @debug_output = nil + @response_body_encoding = false @proxy_from_env = false @proxy_uri = nil @@ -745,6 +746,18 @@ module Net #:nodoc: # The local port used to establish the connection. attr_accessor :local_port + # The encoding to use for the response body. If Encoding, uses the + # specified encoding. If other true value, tries to detect the response + # body encoding. + attr_reader :response_body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def response_body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @response_body_encoding = value + end + attr_writer :proxy_from_env attr_writer :proxy_address attr_writer :proxy_port @@ -1592,6 +1605,7 @@ module Net #:nodoc: begin res = HTTPResponse.read_new(@socket) res.decode_content = req.decode_content + res.body_encoding = @response_body_encoding end while res.kind_of?(HTTPInformation) res.uri = req.uri diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb index 08eaeb2cac..ecbfd42d2b 100644 --- a/lib/net/http/response.rb +++ b/lib/net/http/response.rb @@ -84,6 +84,7 @@ class Net::HTTPResponse @read = false @uri = nil @decode_content = false + @body_encoding = false end # The HTTP version supported by the server. @@ -106,6 +107,18 @@ class Net::HTTPResponse # Accept-Encoding header from the user. attr_accessor :decode_content + # The encoding to use for the response body. If Encoding, use that encoding. + # If other true value, attempt to detect the appropriate encoding, and use + # that. + attr_reader :body_encoding + + # Set the encoding to use for the response body. If given a String, find + # the related Encoding. + def body_encoding=(value) + value = Encoding.find(value) if value.is_a?(String) + @body_encoding = value + end + def inspect "#<#{self.class} #{@code} #{@message} readbody=#{@read}>" end @@ -214,6 +227,17 @@ class Net::HTTPResponse end @read = true + case enc = @body_encoding + when Encoding, false, nil + # Encoding: force given encoding + # false/nil: do not force encoding + else + # other value: detect encoding from body + enc = detect_encoding(@body) + end + + @body.force_encoding(enc) if enc + @body end @@ -245,6 +269,141 @@ class Net::HTTPResponse private + # :nodoc: + def detect_encoding(str, encoding=nil) + if encoding + elsif encoding = type_params['charset'] + elsif encoding = check_bom(str) + else + encoding = case content_type&.downcase + when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml} + /\A' + ss.getch + return nil + end + name = ss.scan(/[^=\t\n\f\r \/>]*/) + name.downcase! + raise if name.empty? + ss.skip(/[\t\n\f\r ]*/) + if ss.getch != '=' + value = '' + return [name, value] + end + ss.skip(/[\t\n\f\r ]*/) + case ss.peek(1) + when '"' + ss.getch + value = ss.scan(/[^"]+/) + value.downcase! + ss.getch + when "'" + ss.getch + value = ss.scan(/[^']+/) + value.downcase! + ss.getch + when '>' + value = '' + else + value = ss.scan(/[^\t\n\f\r >]+/) + value.downcase! + end + [name, value] + end + + def extracting_encodings_from_meta_elements(value) + # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element + if /charset[\t\n\f\r ]*=(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i =~ value + return $1 || $2 || $3 + end + return nil + end + ## # Checks for a supported Content-Encoding header and yields an Inflate # wrapper for this response's socket when zlib is present. If the diff --git a/test/net/http/test_http.rb b/test/net/http/test_http.rb index b5156078a4..4725a79147 100644 --- a/test/net/http/test_http.rb +++ b/test/net/http/test_http.rb @@ -1294,3 +1294,57 @@ class TestNetHTTPLocalBind < Test::Unit::TestCase end end +class TestNetHTTPForceEncoding < Test::Unit::TestCase + CONFIG = { + 'host' => 'localhost', + 'proxy_host' => nil, + 'proxy_port' => nil, + } + + include TestNetHTTPUtils + + def fe_request(force_enc, content_type=nil) + @server.mount_proc('/fe') do |req, res| + res['Content-Type'] = content_type if content_type + res.body = "hello\u1234" + end + + http = Net::HTTP.new(config('host'), config('port')) + http.local_host = Addrinfo.tcp(config('host'), config('port')).ip_address + assert_not_nil(http.local_host) + assert_nil(http.local_port) + + http.response_body_encoding = force_enc + http.get('/fe') + end + + def test_response_body_encoding_false + res = fe_request(false) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_response_body_encoding_true_without_content_type + res = fe_request(true) + assert_equal("hello\u1234".b, res.body) + assert_equal(Encoding::ASCII_8BIT, res.body.encoding) + end + + def test_response_body_encoding_true_with_content_type + res = fe_request(true, 'text/html; charset=utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def test_response_body_encoding_string_without_content_type + res = fe_request('utf-8') + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end + + def test_response_body_encoding_encoding_without_content_type + res = fe_request(Encoding::UTF_8) + assert_equal("hello\u1234", res.body) + assert_equal(Encoding::UTF_8, res.body.encoding) + end +end diff --git a/test/net/http/test_httpresponse.rb b/test/net/http/test_httpresponse.rb index 86a467ac19..eb2551df46 100644 --- a/test/net/http/test_httpresponse.rb +++ b/test/net/http/test_httpresponse.rb @@ -54,6 +54,241 @@ EOS assert_equal 'hello', body end + def test_read_body_body_encoding_false + body = "hello\u1234" + io = dummy_io(<hello\u1234" + io = dummy_io(<hello\u1234" + io = dummy_io(<