summaryrefslogtreecommitdiff
path: root/lib/net
diff options
context:
space:
mode:
authorJeremy Evans <code@jeremyevans.net>2022-04-11 08:17:19 -0700
committergit <svn-admin@ruby-lang.org>2022-04-12 00:17:34 +0900
commitebb4378237e572ce2e888136a613c7c051439f95 (patch)
tree13f22663021500a3aa223fcd175db1a05caf246e /lib/net
parent4bd38e8120f2fdfdd47a34211720e048502377f1 (diff)
[ruby/net-http] Add HTTP#response_body_encoding for setting response body encoding
This allows opting in to setting the encoding of response bodies. Setting the accessor to a String or Encoding instance uses the specified encoding. Setting it to true tries to detect the encoding of the response body, either from the Content-Type header (assuming it specifies a charset) or by scanning the document for a <meta> tag that specifies the encoding. The default is false, in which case no encoding is forced (the same behavior as before the patch). Implements [Feature #2567]. Implements [Feature #15517]. https://github.com/ruby/net-http/commit/6233e6b7c1 Co-authored-by: Yui Naruse <naruse@ruby-lang.org>
Diffstat (limited to 'lib/net')
-rw-r--r--lib/net/http.rb14
-rw-r--r--lib/net/http/response.rb159
2 files changed, 173 insertions, 0 deletions
diff --git a/lib/net/http.rb b/lib/net/http.rb
index 3fcf23b05c..5e64e38665 100644
--- a/lib/net/http.rb
+++ b/lib/net/http.rb
@@ -698,6 +698,7 @@ module Net #:nodoc:
@continue_timeout = nil
@max_retries = 1
@debug_output = nil
+ @response_body_encoding = false
@proxy_from_env = false
@proxy_uri = nil
@@ -745,6 +746,18 @@ module Net #:nodoc:
# The local port used to establish the connection.
attr_accessor :local_port
+ # The encoding to use for the response body. If Encoding, uses the
+ # specified encoding. If other true value, tries to detect the response
+ # body encoding.
+ attr_reader :response_body_encoding
+
+ # Set the encoding to use for the response body. If given a String, find
+ # the related Encoding.
def response_body_encoding=(value)
  # A String names an encoding and is resolved eagerly (Encoding.find raises
  # ArgumentError for an unknown name); any other value is stored untouched.
  @response_body_encoding = value.is_a?(String) ? Encoding.find(value) : value
end
+
attr_writer :proxy_from_env
attr_writer :proxy_address
attr_writer :proxy_port
@@ -1592,6 +1605,7 @@ module Net #:nodoc:
begin
res = HTTPResponse.read_new(@socket)
res.decode_content = req.decode_content
+ res.body_encoding = @response_body_encoding
end while res.kind_of?(HTTPInformation)
res.uri = req.uri
diff --git a/lib/net/http/response.rb b/lib/net/http/response.rb
index 08eaeb2cac..ecbfd42d2b 100644
--- a/lib/net/http/response.rb
+++ b/lib/net/http/response.rb
@@ -84,6 +84,7 @@ class Net::HTTPResponse
@read = false
@uri = nil
@decode_content = false
+ @body_encoding = false
end
# The HTTP version supported by the server.
@@ -106,6 +107,18 @@ class Net::HTTPResponse
# Accept-Encoding header from the user.
attr_accessor :decode_content
+ # The encoding to use for the response body. If Encoding, use that encoding.
+ # If other true value, attempt to detect the appropriate encoding, and use
+ # that.
+ attr_reader :body_encoding
+
+ # Set the encoding to use for the response body. If given a String, find
+ # the related Encoding.
def body_encoding=(value)
  # Encoding names given as Strings are looked up right away, so an unknown
  # name fails here (ArgumentError) rather than when the body is read.
  @body_encoding =
    case value
    when String then Encoding.find(value)
    else value
    end
end
+
def inspect
"#<#{self.class} #{@code} #{@message} readbody=#{@read}>"
end
@@ -214,6 +227,17 @@ class Net::HTTPResponse
end
@read = true
+ case enc = @body_encoding
+ when Encoding, false, nil
+ # Encoding: force given encoding
+ # false/nil: do not force encoding
+ else
+ # other value: detect encoding from body
+ enc = detect_encoding(@body)
+ end
+
+ @body.force_encoding(enc) if enc
+
@body
end
@@ -245,6 +269,141 @@ class Net::HTTPResponse
private
+ # :nodoc:
# Determines which character encoding to apply to the response body +str+.
#
# Sources are consulted in this order and the first hit wins:
# 1. the +encoding+ argument, when given;
# 2. the +charset+ parameter of the Content-Type header;
# 3. a byte order mark at the start of +str+;
# 4. for XML content types, the +encoding+ pseudo-attribute of the XML
#    declaration (UTF-8 when the body has no such declaration);
# 5. for text/html, the result of +sniff_encoding+.
#
# Returns an Encoding, an encoding name String (header charset), or +nil+
# when nothing could be determined.
def detect_encoding(str, encoding=nil)
  return encoding if encoding

  encoding = type_params['charset'] || check_bom(str)
  return encoding if encoding

  case content_type&.downcase
  when %r{text/x(?:ht)?ml|application/(?:[^+]+\+)?xml}
    # An XML declaration starts with "<?xml"; matching "<xml" would never
    # recognize a real declaration and every XML body would be treated as
    # UTF-8 regardless of what it declares.
    declaration = /\A<\?xml[ \t\r\n]+
      version[ \t\r\n]*=[ \t\r\n]*(?:"[0-9.]+"|'[0-9.]+')[ \t\r\n]+
      encoding[ \t\r\n]*=[ \t\r\n]*
      (?:"([A-Za-z][\-A-Za-z0-9._]*)"|'([A-Za-z][\-A-Za-z0-9._]*)')/x.match(str)
    name = declaration && (declaration[1] || declaration[2])
    return Encoding::UTF_8 unless name

    begin
      Encoding.find(name)
    rescue ArgumentError
      # The document declares an encoding Ruby does not know. Returning nil
      # leaves the body untouched instead of handing an unusable name to the
      # caller.
      nil
    end
  when %r{text/html.*}
    sniff_encoding(str)
  end
end
+
+ # :nodoc:
# Guesses the encoding of an HTML body, loosely following the encoding
# sniffing algorithm:
# http://www.w3.org/TR/html5/parsing.html#determining-the-character-encoding
#
# A <meta> declaration found by +scanning_meta+ takes precedence. Without
# one, a pure-ASCII body is reported as US-ASCII and a body that is valid
# UTF-8 as UTF-8. Returns +nil+ when none of these apply. The +encoding+
# argument is accepted but not consulted.
def sniff_encoding(str, encoding=nil)
  declared = scanning_meta(str)
  return declared if declared

  # Steps 6 (last visited page) and 7 (frequency analysis) of the algorithm
  # are not implemented; only the two cheap checks below are performed.
  return Encoding::US_ASCII if str.ascii_only?

  # Step 8 (implementation-defined or user-specified default) is left to the
  # caller: nil is returned when the bytes are not valid UTF-8.
  Encoding::UTF_8 if str.dup.force_encoding(Encoding::UTF_8).valid_encoding?
end
+
+ # :nodoc:
# Returns the encoding announced by a byte order mark at the start of +str+
# (UTF-16BE, UTF-16LE or UTF-8), or +nil+ when +str+ does not start with one.
#
# The comparison is done on raw byte values. Comparing string slices against
# "\xFE\xFF"-style literals does not work for a binary (ASCII-8BIT) body:
# Ruby considers two strings with different encodings and non-ASCII bytes
# unequal, so a BOM would never be recognized.
def check_bom(str)
  lead = str.byteslice(0, 3).bytes

  case lead.first(2)
  when [0xFE, 0xFF] then Encoding::UTF_16BE
  when [0xFF, 0xFE] then Encoding::UTF_16LE
  else
    Encoding::UTF_8 if lead == [0xEF, 0xBB, 0xBF]
  end
end
+
+ # :nodoc:
# Scans +str+ for HTML <meta> elements and returns the Encoding declared by
# the first one that carries a usable declaration, or +nil+ when there is
# none. Two forms are recognized:
#
#   <meta charset="...">
#   <meta http-equiv="Content-Type" content="...; charset=...">
#
# Every <meta> element is examined (tag and attribute names are matched
# case-insensitively), so a charset declaration does not have to be the
# first <meta> in the document. Declarations naming an encoding Ruby does not
# know are skipped. A UTF-16 declaration is reported as UTF-8: the tag was
# readable as ASCII-compatible bytes, so the document cannot be UTF-16.
def scanning_meta(str)
  require 'strscan'
  ss = StringScanner.new(str)

  # "<meta" must be followed by whitespace or "/" so that elements such as
  # <metadata> are not mistaken for <meta>.
  while ss.scan_until(/<meta[\t\n\f\r \/]/i)
    seen = {} # attribute names already processed for this element
    got_pragma = false
    need_pragma = nil
    charset = nil

    # step: Attributes
    while (attr = get_attribute(ss))
      name, value = attr
      next if seen[name] # only the first occurrence of an attribute counts
      seen[name] = true

      case name
      when 'http-equiv'
        got_pragma = true if value == 'content-type'
      when 'content'
        # Only a content attribute that actually yields an encoding makes the
        # pragma necessary; it never overrides an earlier charset attribute.
        if charset.nil? && (found = extracting_encodings_from_meta_elements(value))
          charset = found
          need_pragma = true
        end
      when 'charset'
        need_pragma = false
        charset = value
      end
    end

    # step: Processing
    next if need_pragma.nil?            # no encoding information in this element
    next if need_pragma && !got_pragma  # content= without http-equiv=content-type

    encoding =
      begin
        Encoding.find(charset)
      rescue ArgumentError
        nil # unknown or empty encoding label
      end
    next unless encoding

    utf16 = [Encoding::UTF_16, Encoding::UTF_16BE, Encoding::UTF_16LE]
    return utf16.include?(encoding) ? Encoding::UTF_8 : encoding # tentative
  end

  nil
end

# Reads the next attribute of the tag the scanner +ss+ is positioned in.
#
# Returns a two-element Array [name, value], both lower-cased; the value is
# '' when the attribute has none. Returns +nil+ at the end of the tag (the
# closing ">" is consumed) or at the end of the input. Malformed or truncated
# markup never raises.
def get_attribute(ss)
  ss.skip(/[\t\n\f\r \/]*/)
  return nil if ss.eos?

  if ss.peek(1) == '>'
    ss.getch
    return nil
  end

  # A leading "=" belongs to the attribute name, so the name is never empty
  # and the scanner always advances.
  name = ss.scan(/=?[^=\t\n\f\r \/>]*/).downcase
  ss.skip(/[\t\n\f\r ]*/)

  # Consume "=" only when it is really there; otherwise the first character
  # of the following attribute would be swallowed.
  return [name, ''] unless ss.skip(/=/)

  ss.skip(/[\t\n\f\r ]*/)
  value =
    case ss.peek(1)
    when '"', "'"
      quote = ss.getch
      # "*" rather than "+" so that an empty value ("") yields '' and not nil.
      text = ss.scan(quote == '"' ? /[^"]*/ : /[^']*/)
      ss.getch # closing quote; nil when the input ends inside the value
      text
    when '>', ''
      # End of tag or end of input right after "=": empty value. The ">" is
      # left in place for the next call to consume.
      ''
    else
      ss.scan(/[^\t\n\f\r >]+/)
    end

  [name, value.downcase]
end
+
# Extracts the encoding label from the value of a <meta content="...">
# attribute, e.g. "text/html; charset=utf-8" yields "utf-8".
#
# Whitespace is allowed on both sides of the "=" sign, and the label may be
# wrapped in matching single or double quotes. Returns the label as a String
# (not validated against known encodings), or +nil+ when +value+ holds no
# charset parameter, the label is missing, or a quote is left unclosed.
def extracting_encodings_from_meta_elements(value)
  # http://dev.w3.org/html5/spec/fetching-resources.html#algorithm-for-extracting-an-encoding-from-a-meta-element
  found = /charset[\t\n\f\r ]*=[\t\n\f\r ]*(?:"([^"]*)"|'([^']*)'|["']|\z|([^\t\n\f\r ;]+))/i.match(value)
  found && (found[1] || found[2] || found[3])
end
+
##
# Checks for a supported Content-Encoding header and yields an Inflate
# wrapper for this response's socket when zlib is present. If the