summary | refs | log | tree | commit | diff
path: root/lib/rexml
diff options
context:
space:
mode:
author kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2012-10-28 14:52:21 +0000
committer kou <kou@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2012-10-28 14:52:21 +0000
commit 718813ca9b2228274b32a9119023074be116fe9b (patch)
tree 657746efc41a8c6e4dbb30248db8e127f1f15932 /lib/rexml
parent 7ba54654a5f102be085bda53d4f117dfc0b06d8f (diff)
* lib/rexml/source.rb: Move encoding detection code to base class.
* lib/rexml/encoding.rb: Remove needless encoding detection code.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@37365 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rexml')
-rw-r--r-- lib/rexml/encoding.rb 13
-rw-r--r-- lib/rexml/source.rb 101
2 files changed, 56 insertions, 58 deletions
diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb
index d1d5172841..2863af9147 100644
--- a/lib/rexml/encoding.rb
+++ b/lib/rexml/encoding.rb
@@ -20,19 +20,6 @@ module REXML
true
end
- def check_encoding(xml)
- # We have to recognize UTF-16BE, UTF-16LE, and UTF-8
- if xml[0, 2] == "\xfe\xff"
- xml[0, 2] = ""
- return 'UTF-16BE'
- elsif xml[0, 2] == "\xff\xfe"
- xml[0, 2] = ""
- return 'UTF-16LE'
- end
- xml =~ /^\s*<\?xml\s+version\s*=\s*(['"]).*?\1\s+encoding\s*=\s*(["'])(.*?)\2/m
- return $3 ? $3.upcase : 'UTF-8'
- end
-
def encode(string)
string.encode(@encoding)
end
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index c15f63dcc8..eec1b6c7e8 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -43,7 +43,7 @@ module REXML
if encoding
self.encoding = encoding
else
- self.encoding = check_encoding( @buffer )
+ detect_encoding
end
@line = 0
end
@@ -53,14 +53,7 @@ module REXML
# Overridden to support optimized en/decoding
def encoding=(enc)
return unless super
- @line_break = encode( '>' )
- if @encoding != 'UTF-8'
- @buffer = decode(@buffer)
- @to_utf = true
- else
- @to_utf = false
- @buffer.force_encoding ::Encoding::UTF_8
- end
+ encoding_updated
end
# Scans the source for a given pattern. Note, that this is not your
@@ -125,6 +118,38 @@ module REXML
res = res[-1] if res.kind_of? Array
lines.index( res ) if res
end
+
+ private
+ def detect_encoding
+ buffer_encoding = @buffer.encoding
+ detected_encoding = "UTF-8"
+ begin
+ @buffer.force_encoding("ASCII-8BIT")
+ if @buffer[0, 2] == "\xfe\xff"
+ @buffer[0, 2] = ""
+ detected_encoding = "UTF-16BE"
+ elsif @buffer[0, 2] == "\xff\xfe"
+ @buffer[0, 2] = ""
+ detected_encoding = "UTF-16LE"
+ elsif @buffer[0, 3] == "\xef\xbb\xbf"
+ @buffer[0, 3] = ""
+ detected_encoding = "UTF-8"
+ end
+ ensure
+ @buffer.force_encoding(buffer_encoding)
+ end
+ self.encoding = detected_encoding
+ end
+
+ def encoding_updated
+ if @encoding != 'UTF-8'
+ @buffer = decode(@buffer)
+ @to_utf = true
+ else
+ @to_utf = false
+ @buffer.force_encoding ::Encoding::UTF_8
+ end
+ end
end
# A Source that wraps an IO. See the Source class for method
@@ -136,46 +161,12 @@ module REXML
def initialize(arg, block_size=500, encoding=nil)
@er_source = @source = arg
@to_utf = false
+ @pending_buffer = nil
- # Determining the encoding is a deceptively difficult issue to resolve.
- # First, we check the first two bytes for UTF-16. Then we
- # assume that the encoding is at least ASCII enough for the '>', and
- # we read until we get one of those. This gives us the XML declaration,
- # if there is one. If there isn't one, the file MUST be UTF-8, as per
- # the XML spec. If there is one, we can determine the encoding from
- # it.
if encoding
super("", encoding)
else
- need_super_with_line = false
- str = @source.read( 2 ) || ''
- str.force_encoding("ASCII-8BIT")
- if str[0, 2] == "\xfe\xff"
- @source.binmode
- @source.set_encoding("UTF-16BE")
- super("", "UTF-16BE")
- elsif str[0, 2] == "\xff\xfe"
- @source.binmode
- @source.set_encoding("UTF-16LE")
- super("", "UTF-16LE")
- elsif str[0, 2] == "\xef\xbb"
- str += @source.read(1)
- if str[2, 1] == "\xBF"
- @source.set_encoding("UTF-8")
- super("", "UTF-8")
- else
- need_super_with_line = true
- end
- else
- need_super_with_line = true
- end
- if need_super_with_line
- if @source.eof?
- super(str)
- else
- super(str + @source.readline(">"))
- end
- end
+ super(@source.read(3) || "")
end
if !@to_utf and
@@ -271,6 +262,14 @@ module REXML
private
def readline
str = @source.readline(@line_break)
+ if @pending_buffer
+ if str.nil?
+ str = @pending_buffer
+ else
+ str = @pending_buffer + str
+ end
+ @pending_buffer = nil
+ end
return nil if str.nil?
if @to_utf
@@ -280,5 +279,17 @@ module REXML
str
end
end
+
+ def encoding_updated
+ case @encoding
+ when "UTF-16BE", "UTF-16LE"
+ @source.binmode
+ @source.set_encoding(@encoding)
+ end
+ @line_break = encode(">")
+ @pending_buffer, @buffer = @buffer, ""
+ @pending_buffer.force_encoding(@encoding)
+ super
+ end
end
end