summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2012-10-28 12:31:20 +0000
committer: kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2012-10-28 12:31:20 +0000
commit: 2a42c1bd3a8fcaa59f778070bff0eac60757ced3 (patch)
tree: bcd9a2aa1f23d4f35bfd7fd30ad6ecc58303d43a
parent: 100b3be9ae7f6ad0af53dd87d0e9706df6e813c3 (diff)
* lib/rexml/source.rb (REXML::IOSource#initialize): Reduce
  @line_break initialization code. It should be done only in #encoding=.
* lib/rexml/parsers/baseparser.rb: Don't set the source encoding to UTF-16 from encoding="UTF-16" in the XML declaration, because the UTF-16XX source encoding should be set in Source#initialize or IOSource#initialize. They should handle the BOM; the parser should not need to consider it.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37361 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- ChangeLog 10
-rw-r--r-- lib/rexml/parsers/baseparser.rb 11
-rw-r--r-- lib/rexml/source.rb 43
3 files changed, 50 insertions, 14 deletions
diff --git a/ChangeLog b/ChangeLog
index ffc27525ba..329f6d6a88 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+Sun Oct 28 21:25:11 2012 Kouhei Sutou <kou@cozmixng.org>
+
+ * lib/rexml/source.rb (REXML::IOSource#initialize): Reduce
+ @line_break initialize code. It should be done only in #encoding=.
+ * lib/rexml/parsers/baseparser.rb: Don't set UTF-16 encoding to
+ source by encoding="UTF-16" in XML declaration because UTF-16XX
+ source encoding should be set in Source#initialize or
+ IOSource#initialize. They should handle the BOM. The parser should not
+ need to consider it.
+
Sun Oct 28 21:18:37 2012 Kouhei Sutou <kou@cozmixng.org>
* test/rexml/test_document.rb: Add tests for parsing XML encoded
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index ebffdaa8c7..dc4a1c8bee 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -212,7 +212,9 @@ module REXML
version = version[1] unless version.nil?
encoding = ENCODING.match(results)
encoding = encoding[1] unless encoding.nil?
- @source.encoding = encoding
+ if need_source_encoding_update?(encoding)
+ @source.encoding = encoding
+ end
standalone = STANDALONE.match(results)
standalone = standalone[1] unless standalone.nil?
return [ :xmldecl, version, encoding, standalone ]
@@ -493,6 +495,13 @@ module REXML
end
rv
end
+
+ private
+ def need_source_encoding_update?(xml_declaration_encoding)
+ return false if xml_declaration_encoding.nil?
+ return false if /\AUTF-16\z/i =~ xml_declaration_encoding
+ true
+ end
end
end
end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 112393cfd4..c15f63dcc8 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -144,22 +144,39 @@ module REXML
# if there is one. If there isn't one, the file MUST be UTF-8, as per
# the XML spec. If there is one, we can determine the encoding from
# it.
- @buffer = ""
- str = @source.read( 2 ) || ''
if encoding
- self.encoding = encoding
- elsif str[0,2] == "\xfe\xff"
- @line_break = "\000>"
- elsif str[0,2] == "\xff\xfe"
- @line_break = ">\000"
- elsif str[0,2] == "\xef\xbb"
- str += @source.read(1)
- str = '' if (str[2,1] == "\xBF")
- @line_break = ">"
+ super("", encoding)
else
- @line_break = ">"
+ need_super_with_line = false
+ str = @source.read( 2 ) || ''
+ str.force_encoding("ASCII-8BIT")
+ if str[0, 2] == "\xfe\xff"
+ @source.binmode
+ @source.set_encoding("UTF-16BE")
+ super("", "UTF-16BE")
+ elsif str[0, 2] == "\xff\xfe"
+ @source.binmode
+ @source.set_encoding("UTF-16LE")
+ super("", "UTF-16LE")
+ elsif str[0, 2] == "\xef\xbb"
+ str += @source.read(1)
+ if str[2, 1] == "\xBF"
+ @source.set_encoding("UTF-8")
+ super("", "UTF-8")
+ else
+ need_super_with_line = true
+ end
+ else
+ need_super_with_line = true
+ end
+ if need_super_with_line
+ if @source.eof?
+ super(str)
+ else
+ super(str + @source.readline(">"))
+ end
+ end
end
- super( @source.eof? ? str : str+@source.readline( @line_break ) )
if !@to_utf and
@buffer.respond_to?(:force_encoding) and