From 544c019df27d721b6707b15f72f9603bd4950532 Mon Sep 17 00:00:00 2001 From: knu Date: Fri, 18 Apr 2008 07:22:13 +0000 Subject: * lib/rexml: Merge fixes since 1.8.6 made solely on the ruby_1_8_6 branch. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_8_7@16068 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/source.rb | 361 ++++++++++++++++++++++++++-------------------------- 1 file changed, 184 insertions(+), 177 deletions(-) (limited to 'lib/rexml/source.rb') diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 6cbc368d50..ce7a2c98b0 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,139 +1,140 @@ require 'rexml/encoding' module REXML - # Generates Source-s. USE THIS CLASS. - class SourceFactory - # Generates a Source object - # @param arg Either a String, or an IO - # @return a Source, or nil if a bad argument was given - def SourceFactory::create_from(arg) + # Generates Source-s. USE THIS CLASS. + class SourceFactory + # Generates a Source object + # @param arg Either a String, or an IO + # @return a Source, or nil if a bad argument was given + def SourceFactory::create_from(arg) if arg.kind_of? String - Source.new(arg) + Source.new(arg) elsif arg.respond_to? :read and arg.respond_to? :readline and arg.respond_to? :nil? and arg.respond_to? :eof? - IOSource.new(arg) + IOSource.new(arg) elsif arg.kind_of? Source arg else raise "#{arg.class} is not a valid input stream. It must walk \n"+ - "like either a String, IO, or Source." + "like either a String, an IO, or a Source." end - end - end - - # A Source can be searched for patterns, and wraps buffers and other - # objects and provides consumption of text - class Source - include Encoding - # The current buffer (what we're going to read next) - attr_reader :buffer - # The line number of the last consumed text - attr_reader :line - attr_reader :encoding - - # Constructor - # @param arg must be a String, and should be a valid XML document + end + end + + # A Source can be searched for patterns, and wraps buffers and other + # objects and provides consumption of text + class Source + include Encoding + # The current buffer (what we're going to read next) + attr_reader :buffer + # The line number of the last consumed text + attr_reader :line + attr_reader :encoding + + # Constructor + # @param arg must be a String, and should be a valid XML document # @param encoding if non-null, sets the encoding of the source to this # value, overriding all encoding detection - def initialize(arg, encoding=nil) - @orig = @buffer = arg + def initialize(arg, encoding=nil) + @orig = @buffer = arg if encoding self.encoding = encoding else self.encoding = check_encoding( @buffer ) end - @line = 0 - end - - - # Inherited from Encoding - # Overridden to support optimized en/decoding - def encoding=(enc) - return unless super - @line_break = encode( '>' ) - if enc != UTF_8 - @buffer = decode(@buffer) - @to_utf = true - else - @to_utf = false - end - end - - # Scans the source for a given pattern. Note, that this is not your - # usual scan() method. For one thing, the pattern argument has some - # requirements; for another, the source can be consumed. You can easily - # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexes on the fly; however, this was - # computationally expensive and slowed down the entire REXML package - # considerably, since this is by far the most commonly called method. - # @param pattern must be a Regexp, and must be in the form of - # /^\s*(#{your pattern, with no groups})(.*)/. The first group - # will be returned; the second group is used if the consume flag is - # set. - # @param consume if true, the pattern returned will be consumed, leaving - # everything after it in the Source. - # @return the pattern, if found, or nil if the Source is empty or the - # pattern is not found. - def scan(pattern, cons=false) - return nil if @buffer.nil? - rv = @buffer.scan(pattern) - @buffer = $' if cons and rv.size>0 - rv - end - - def read - end - - def consume( pattern ) - @buffer = $' if pattern.match( @buffer ) - end - - def match_to( char, pattern ) - return pattern.match(@buffer) - end - - def match_to_consume( char, pattern ) - md = pattern.match(@buffer) - @buffer = $' - return md - end - - def match(pattern, cons=false) - md = pattern.match(@buffer) - @buffer = $' if cons and md - return md - end - - # @return true if the Source is exhausted - def empty? - @buffer == "" - end + @line = 0 + end + + + # Inherited from Encoding + # Overridden to support optimized en/decoding + def encoding=(enc) + return unless super + @line_break = encode( '>' ) + if enc != UTF_8 + @buffer = decode(@buffer) + @to_utf = true + else + @to_utf = false + end + end + + # Scans the source for a given pattern. Note, that this is not your + # usual scan() method. For one thing, the pattern argument has some + # requirements; for another, the source can be consumed. You can easily + # confuse this method. Originally, the patterns were easier + # to construct and this method more robust, because this method + # generated search regexes on the fly; however, this was + # computationally expensive and slowed down the entire REXML package + # considerably, since this is by far the most commonly called method. + # @param pattern must be a Regexp, and must be in the form of + # /^\s*(#{your pattern, with no groups})(.*)/. The first group + # will be returned; the second group is used if the consume flag is + # set. + # @param consume if true, the pattern returned will be consumed, leaving + # everything after it in the Source. + # @return the pattern, if found, or nil if the Source is empty or the + # pattern is not found. + def scan(pattern, cons=false) + return nil if @buffer.nil? + rv = @buffer.scan(pattern) + @buffer = $' if cons and rv.size>0 + rv + end + + def read + end + + def consume( pattern ) + @buffer = $' if pattern.match( @buffer ) + end + + def match_to( char, pattern ) + return pattern.match(@buffer) + end + + def match_to_consume( char, pattern ) + md = pattern.match(@buffer) + @buffer = $' + return md + end + + def match(pattern, cons=false) + md = pattern.match(@buffer) + @buffer = $' if cons and md + return md + end + + # @return true if the Source is exhausted + def empty? + @buffer == "" + end def position @orig.index( @buffer ) end - # @return the current line in the source - def current_line - lines = @orig.split - res = lines.grep @buffer[0..30] - res = res[-1] if res.kind_of? Array - lines.index( res ) if res - end - end + # @return the current line in the source + def current_line + lines = @orig.split + res = lines.grep @buffer[0..30] + res = res[-1] if res.kind_of? Array + lines.index( res ) if res + end + end - # A Source that wraps an IO. See the Source class for method - # documentation - class IOSource < Source - #attr_reader :block_size + # A Source that wraps an IO. See the Source class for method + # documentation + class IOSource < Source + #attr_reader :block_size # block_size has been deprecated - def initialize(arg, block_size=500, encoding=nil) - @er_source = @source = arg - @to_utf = false + def initialize(arg, block_size=500, encoding=nil) + @er_source = @source = arg + @to_utf = false + # Determining the encoding is a deceptively difficult issue to resolve. # First, we check the first two bytes for UTF-16. Then we # assume that the encoding is at least ASCII enough for the '>', and @@ -145,88 +146,94 @@ module REXML str = @source.read( 2 ) if encoding self.encoding = encoding - elsif /\A(?:\xfe\xff|\xff\xfe)/n =~ str - self.encoding = check_encoding( str ) + elsif 0xfe == str[0] && 0xff == str[1] + @line_break = "\000>" + elsif 0xff == str[0] && 0xfe == str[1] + @line_break = ">\000" + elsif 0xef == str[0] && 0xbb == str[1] + str += @source.read(1) + str = '' if (0xbf == str[2]) + @line_break = ">" else - @line_break = '>' + @line_break = ">" end super str+@source.readline( @line_break ) end - def scan(pattern, cons=false) - rv = super - # You'll notice that this next section is very similar to the same - # section in match(), but just a liiittle different. This is - # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrent duplicating - # some code - if rv.size == 0 - until @buffer =~ pattern or @source.nil? - begin - # READLINE OPT - #str = @source.read(@block_size) - str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str + def scan(pattern, cons=false) + rv = super + # You'll notice that this next section is very similar to the same + # section in match(), but just a liiittle different. This is + # because it is a touch faster to do it this way with scan() + # than the way match() does it; enough faster to warrent duplicating + # some code + if rv.size == 0 + until @buffer =~ pattern or @source.nil? + begin + # READLINE OPT + #str = @source.read(@block_size) + str = @source.readline(@line_break) + str = decode(str) if @to_utf and str + @buffer << str rescue Iconv::IllegalSequence raise - rescue - @source = nil - end - end - rv = super - end - rv.taint - rv - end - - def read - begin + rescue + @source = nil + end + end + rv = super + end + rv.taint + rv + end + + def read + begin str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str - rescue Exception, NameError - @source = nil - end - end - - def consume( pattern ) - match( pattern, true ) - end - - def match( pattern, cons=false ) - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - while !rv and @source - begin + str = decode(str) if @to_utf and str + @buffer << str + rescue Exception, NameError + @source = nil + end + end + + def consume( pattern ) + match( pattern, true ) + end + + def match( pattern, cons=false ) + rv = pattern.match(@buffer) + @buffer = $' if cons and rv + while !rv and @source + begin str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str - rv = pattern.match(@buffer) - @buffer = $' if cons and rv - rescue - @source = nil - end - end - rv.taint - rv - end - - def empty? - super and ( @source.nil? || @source.eof? ) - end + str = decode(str) if @to_utf and str + @buffer << str + rv = pattern.match(@buffer) + @buffer = $' if cons and rv + rescue + @source = nil + end + end + rv.taint + rv + end + + def empty? + super and ( @source.nil? || @source.eof? ) + end def position @er_source.stat.pipe? ? 0 : @er_source.pos end - # @return the current line in the source - def current_line + # @return the current line in the source + def current_line begin - pos = @er_source.pos # The byte position in the source - lineno = @er_source.lineno # The XML < position in the source + pos = @er_source.pos # The byte position in the source + lineno = @er_source.lineno # The XML < position in the source @er_source.rewind - line = 0 # The \r\n position in the source + line = 0 # The \r\n position in the source begin while @er_source.pos < pos @er_source.readline @@ -238,7 +245,7 @@ module REXML pos = -1 line = -1 end - [pos, lineno, line] - end - end + [pos, lineno, line] + end + end end -- cgit v1.2.3