summaryrefslogtreecommitdiff
path: root/lib/rexml/source.rb
diff options
context:
space:
mode:
Diffstat (limited to 'lib/rexml/source.rb')
-rw-r--r--lib/rexml/source.rb148
1 files changed, 97 insertions, 51 deletions
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index ce7a2c98b0..af65cf4751 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -1,3 +1,5 @@
+# coding: US-ASCII
+# frozen_string_literal: false
require 'rexml/encoding'
module REXML
@@ -7,13 +9,14 @@ module REXML
# @param arg Either a String, or an IO
# @return a Source, or nil if a bad argument was given
def SourceFactory::create_from(arg)
- if arg.kind_of? String
- Source.new(arg)
- elsif arg.respond_to? :read and
- arg.respond_to? :readline and
- arg.respond_to? :nil? and
- arg.respond_to? :eof?
+ if arg.respond_to? :read and
+ arg.respond_to? :readline and
+ arg.respond_to? :nil? and
+ arg.respond_to? :eof?
IOSource.new(arg)
+ elsif arg.respond_to? :to_str
+ require 'stringio'
+ IOSource.new(StringIO.new(arg))
elsif arg.kind_of? Source
arg
else
@@ -42,7 +45,7 @@ module REXML
if encoding
self.encoding = encoding
else
- self.encoding = check_encoding( @buffer )
+ detect_encoding
end
@line = 0
end
@@ -52,22 +55,16 @@ module REXML
# Overridden to support optimized en/decoding
def encoding=(enc)
return unless super
- @line_break = encode( '>' )
- if enc != UTF_8
- @buffer = decode(@buffer)
- @to_utf = true
- else
- @to_utf = false
- end
+ encoding_updated
end
# Scans the source for a given pattern. Note, that this is not your
# usual scan() method. For one thing, the pattern argument has some
# requirements; for another, the source can be consumed. You can easily
# confuse this method. Originally, the patterns were easier
- # to construct and this method more robust, because this method
- # generated search regexes on the fly; however, this was
- # computationally expensive and slowed down the entire REXML package
+ # to construct and this method more robust, because this method
+ # generated search regexps on the fly; however, this was
+ # computationally expensive and slowed down the entire REXML package
# considerably, since this is by far the most commonly called method.
# @param pattern must be a Regexp, and must be in the form of
# /^\s*(#{your pattern, with no groups})(.*)/. The first group
@@ -123,6 +120,38 @@ module REXML
res = res[-1] if res.kind_of? Array
lines.index( res ) if res
end
+
+ private
+ def detect_encoding
+ buffer_encoding = @buffer.encoding
+ detected_encoding = "UTF-8"
+ begin
+ @buffer.force_encoding("ASCII-8BIT")
+ if @buffer[0, 2] == "\xfe\xff"
+ @buffer[0, 2] = ""
+ detected_encoding = "UTF-16BE"
+ elsif @buffer[0, 2] == "\xff\xfe"
+ @buffer[0, 2] = ""
+ detected_encoding = "UTF-16LE"
+ elsif @buffer[0, 3] == "\xef\xbb\xbf"
+ @buffer[0, 3] = ""
+ detected_encoding = "UTF-8"
+ end
+ ensure
+ @buffer.force_encoding(buffer_encoding)
+ end
+ self.encoding = detected_encoding
+ end
+
+ def encoding_updated
+ if @encoding != 'UTF-8'
+ @buffer = decode(@buffer)
+ @to_utf = true
+ else
+ @to_utf = false
+ @buffer.force_encoding ::Encoding::UTF_8
+ end
+ end
end
# A Source that wraps an IO. See the Source class for method
@@ -134,30 +163,22 @@ module REXML
def initialize(arg, block_size=500, encoding=nil)
@er_source = @source = arg
@to_utf = false
+ @pending_buffer = nil
- # Determining the encoding is a deceptively difficult issue to resolve.
- # First, we check the first two bytes for UTF-16. Then we
- # assume that the encoding is at least ASCII enough for the '>', and
- # we read until we get one of those. This gives us the XML declaration,
- # if there is one. If there isn't one, the file MUST be UTF-8, as per
- # the XML spec. If there is one, we can determine the encoding from
- # it.
- @buffer = ""
- str = @source.read( 2 )
if encoding
- self.encoding = encoding
- elsif 0xfe == str[0] && 0xff == str[1]
- @line_break = "\000>"
- elsif 0xff == str[0] && 0xfe == str[1]
- @line_break = ">\000"
- elsif 0xef == str[0] && 0xbb == str[1]
- str += @source.read(1)
- str = '' if (0xbf == str[2])
- @line_break = ">"
+ super("", encoding)
+ else
+ super(@source.read(3) || "")
+ end
+
+ if !@to_utf and
+ @buffer.respond_to?(:force_encoding) and
+ @source.respond_to?(:external_encoding) and
+ @source.external_encoding != ::Encoding::UTF_8
+ @force_utf8 = true
else
- @line_break = ">"
+ @force_utf8 = false
end
- super str+@source.readline( @line_break )
end
def scan(pattern, cons=false)
@@ -165,16 +186,12 @@ module REXML
# You'll notice that this next section is very similar to the same
# section in match(), but just a liiittle different. This is
# because it is a touch faster to do it this way with scan()
- # than the way match() does it; enough faster to warrent duplicating
+ # than the way match() does it; enough faster to warrant duplicating
# some code
if rv.size == 0
until @buffer =~ pattern or @source.nil?
begin
- # READLINE OPT
- #str = @source.read(@block_size)
- str = @source.readline(@line_break)
- str = decode(str) if @to_utf and str
- @buffer << str
+ @buffer << readline
rescue Iconv::IllegalSequence
raise
rescue
@@ -189,9 +206,7 @@ module REXML
def read
begin
- str = @source.readline(@line_break)
- str = decode(str) if @to_utf and str
- @buffer << str
+ @buffer << readline
rescue Exception, NameError
@source = nil
end
@@ -206,9 +221,7 @@ module REXML
@buffer = $' if cons and rv
while !rv and @source
begin
- str = @source.readline(@line_break)
- str = decode(str) if @to_utf and str
- @buffer << str
+ @buffer << readline
rv = pattern.match(@buffer)
@buffer = $' if cons and rv
rescue
@@ -218,13 +231,13 @@ module REXML
rv.taint
rv
end
-
+
def empty?
super and ( @source.nil? || @source.eof? )
end
def position
- @er_source.stat.pipe? ? 0 : @er_source.pos
+ @er_source.pos rescue 0
end
# @return the current line in the source
@@ -247,5 +260,38 @@ module REXML
end
[pos, lineno, line]
end
+
+ private
+ def readline
+ str = @source.readline(@line_break)
+ if @pending_buffer
+ if str.nil?
+ str = @pending_buffer
+ else
+ str = @pending_buffer + str
+ end
+ @pending_buffer = nil
+ end
+ return nil if str.nil?
+
+ if @to_utf
+ decode(str)
+ else
+ str.force_encoding(::Encoding::UTF_8) if @force_utf8
+ str
+ end
+ end
+
+ def encoding_updated
+ case @encoding
+ when "UTF-16BE", "UTF-16LE"
+ @source.binmode
+ @source.set_encoding(@encoding, @encoding)
+ end
+ @line_break = encode(">")
+ @pending_buffer, @buffer = @buffer, ""
+ @pending_buffer.force_encoding(@encoding)
+ super
+ end
end
end