From 7d21c237ccd46ec1d56639ce53b5882bf97d9de3 Mon Sep 17 00:00:00 2001 From: ser Date: Fri, 10 Oct 2003 12:54:46 +0000 Subject: * Changes to the encoding mechanism. If iconv is found, it is used first for encoding changes. This should be the case on all 1.8 installations. When it isn't found (<1.6), the native REXML encoding mechanism is used. This cleaned out some files, and tightened up the code a bit; and iconv should be faster than the pure Ruby code. * Changed deprecated assert_not_nil to assert throughout the tests. * Parse exceptions are a little more verbose, and extend RuntimeError. * Bug fixes to XPathParser * The Light API is still shifting, like the sands of the desert. * Fixed a new Ruby 1.8.0 warning, added some speed optimizations, and tightened error reporting in the base parser git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@4737 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/encoding.rb | 74 ++++++++++++++------------------ lib/rexml/encodings/EUC-JP.rb | 24 +++++------ lib/rexml/encodings/EUC-JP_decl.rb | 6 --- lib/rexml/encodings/ICONV.rb | 14 +++++++ lib/rexml/encodings/ISO-8859-1.rb | 4 +- lib/rexml/encodings/ISO-8859-1_decl.rb | 6 --- lib/rexml/encodings/Shift-JIS_decl.rb | 6 --- lib/rexml/encodings/UNILE.rb | 4 +- lib/rexml/encodings/UNILE_decl.rb | 6 --- lib/rexml/encodings/US-ASCII.rb | 4 +- lib/rexml/encodings/US-ASCII_decl.rb | 6 --- lib/rexml/encodings/UTF-16.rb | 4 +- lib/rexml/encodings/UTF-16_decl.rb | 6 --- lib/rexml/encodings/UTF-8.rb | 11 +++++ lib/rexml/light/node.rb | 77 ++++++++++------------------------ lib/rexml/output.rb | 4 -- lib/rexml/parseexception.rb | 21 ++++++---- lib/rexml/parsers/baseparser.rb | 53 +++++++++++++---------- lib/rexml/parsers/lightparser.rb | 14 +++---- lib/rexml/quickpath.rb | 2 +- lib/rexml/rexml.rb | 4 +- lib/rexml/source.rb | 30 +++++++++---- lib/rexml/xpath_parser.rb | 25 ++++------- 23 files changed, 183 insertions(+), 222 deletions(-) delete mode 100644 lib/rexml/encodings/EUC-JP_decl.rb create mode 100644 lib/rexml/encodings/ICONV.rb delete mode 100644 lib/rexml/encodings/ISO-8859-1_decl.rb delete mode 100644 lib/rexml/encodings/Shift-JIS_decl.rb delete mode 100644 lib/rexml/encodings/UNILE_decl.rb delete mode 100644 lib/rexml/encodings/US-ASCII_decl.rb delete mode 100644 lib/rexml/encodings/UTF-16_decl.rb create mode 100644 lib/rexml/encodings/UTF-8.rb diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index 06385d8d52..ad8ba7e342 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -2,61 +2,49 @@ module REXML module Encoding @@uconv_available = false - ENCODING_CLAIMS = { } - - def Encoding.claim( encoding_str, match=nil ) - if match - ENCODING_CLAIMS[ match ] = encoding_str - else - ENCODING_CLAIMS[ /^\s* Encoding name attr_reader :encoding def encoding=( enc ) - enc = UTF_8 unless enc - @encoding = enc.upcase - require "rexml/encodings/#@encoding" unless @encoding == UTF_8 - end - - def check_encoding str - rv = ENCODING_CLAIMS.find{|k,v| str =~ k } - # Raise an exception if there is a declared encoding and we don't - # recognize it - unless rv - if str =~ /^\s* err + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + begin + load enc_file + rescue LoadError + raise Exception.new( "No decoder found for encoding #@encoding. Please install iconv." ) + end + end else - return UTF_8 + enc = UTF_8 + @encoding = enc.upcase + load 'rexml/encodings/UTF-8.rb' end + ensure + $VERBOSE = old_verbosity end - return rv[1] end - def to_utf_8(str) - return str - end - - def from_utf_8 content - return content - end - end - - module Encodingses - encodings = [] - $:.each do |incl_dir| - if Dir[ File.join(incl_dir, 'rexml', 'encodings') ].size > 0 - encodings |= Dir[ File.join(incl_dir, 'rexml', 'encodings', '*_decl.rb') ] - end - encodings.collect!{ |f| File.basename(f) } - encodings.uniq! + def check_encoding str + # We have to recognize UTF-16, LSB UTF-16, and UTF-8 + return UTF_16 if str[0] == 254 && str[1] == 255 + return UNILE if str[0] == 255 && str[1] == 254 + str =~ /^\s* B + # a.b # => B + # a.b[1] # => B + # a.b[1]["x"] = "y" # => B + # a.b[0].c # => B + # a.b.c << "D" # => BD module REXML module Light # Represents a tagged XML element. Elements are characterized by # having children, attributes, and names, and can themselves be # children. - class Node < Array - alias :_old_get :[] - alias :_old_put :[]= - + class Node NAMESPLIT = /^(?:(#{XMLTokens::NCNAME_STR}):)?(#{XMLTokens::NCNAME_STR})/u + PARENTS = [ :element, :document, :doctype ] # Create a new element. def initialize node=nil + @node = node if node.kind_of? String node = [ :text, node ] elsif node.nil? node = [ :document, nil, nil ] elsif node[0] == :start_element node[0] = :element + elsif node[0] == :start_doctype + node[0] = :doctype + elsif node[0] == :start_document + node[0] = :document end - replace( node ) - _old_put( 1, 0, 1 ) - _old_put( 1, nil ) end def size - el!() - super-4 + if PARENTS.include? @node[0] + @node[-1].size + else + 0 + end end def each( &block ) - el!() size.times { |x| yield( at(x+4) ) } end def name - el!() at(2) end def name=( name_str, ns=nil ) - el!() pfx = '' pfx = "#{prefix(ns)}:" if ns - _old_put(1, "#{pfx}#{name_str}") + _old_put(2, "#{pfx}#{name_str}") end def parent=( node ) @@ -78,28 +60,23 @@ module REXML end def local_name - el!() namesplit @name end def local_name=( name_str ) - el!() _old_put( 1, "#@prefix:#{name_str}" ) end def prefix( namespace=nil ) - el!() prefix_of( self, namespace ) end def namespace( prefix=prefix() ) - el!() namespace_of( self, prefix ) end def namespace=( namespace ) - el!() @prefix = prefix( namespace ) pfx = '' pfx = "#@prefix:" if @prefix.size > 0 @@ -107,7 +84,6 @@ module REXML end def []( reference, ns=nil ) - el!() if reference.kind_of? String pfx = '' pfx = "#{prefix(ns)}:" if ns @@ -125,7 +101,6 @@ module REXML # Doesn't handle namespaces yet def []=( reference, ns, value=nil ) - el!() if reference.kind_of? String value = ns unless value at( 3 )[reference] = value @@ -170,12 +145,10 @@ module REXML end def has_name?( name, namespace = '' ) - el!() at(3) == name and namespace() == namespace end def children - el!() self end @@ -187,14 +160,6 @@ module REXML end - def el! - if node_type() != :element and node_type() != :document - _old_put( 0, :element ) - push({}) - end - self - end - private def namesplit diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb index 7d4ab2e13b..c4a7473bfb 100644 --- a/lib/rexml/output.rb +++ b/lib/rexml/output.rb @@ -8,10 +8,6 @@ module REXML @output = real_IO self.encoding = encd - eval <<-EOL - alias :encode :to_#{encoding.tr('-', '_').downcase} - alias :decode :from_#{encoding.tr('-', '_').downcase} - EOL @to_utf = encd == UTF_8 ? false : true end diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index 04928d9175..0fee3ae620 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -1,5 +1,5 @@ module REXML - class ParseException < Exception + class ParseException < RuntimeError attr_accessor :source, :parser, :continued_exception def initialize( message, source=nil, parser=nil, exception=nil ) @@ -12,9 +12,9 @@ module REXML def to_s # Quote the original exception, if there was one if @continued_exception - err = @continued_exception.message + err = @continued_exception.inspect err << "\n" - err << @continued_exception.backtrace[0..3].join("\n") + err << @continued_exception.backtrace.join("\n") err << "\n...\n" else err = "" @@ -24,17 +24,24 @@ module REXML err << super # Add contextual information - err << "\n#{@source.current_line}\nLast 80 unconsumed characters:\n#{@source.buffer[0..80].gsub(/\n/, ' ')}\n" if @source - err << "\nContext:\n#{@parser.context}" if @parser + if @source + err << "\nLine: #{line}\n" + err << "Position: #{position}\n" + err << "Last 80 unconsumed characters:\n" + err << @source.buffer[0..80].gsub(/\n/, ' ') + err << "\n" + err << @source.buffer[0..80].unpack("U*").inspect + end + err end def position - @source.current_line[0] if @source + @source.current_line[0] if @source and @source.current_line end def line - @source.current_line[2] if @source + @source.current_line[2] if @source and @source.current_line end def context diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index d6e04c7817..27c9642a68 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -89,10 +89,10 @@ module REXML EREFERENCE = /&(?!#{NAME};)/ DEFAULT_ENTITIES = { - 'gt' => [/>/, '>', '>'], - 'lt' => [/</, '<', '<'], - 'quot' => [/"/, '"', '"'], - "apos" => [/'/, "'", "'"] + 'gt' => [/>/, '>', '>', />/], + 'lt' => [/</, '<', '<', / [/"/, '"', '"', /"/], + "apos" => [/'/, "'", "'", /'/] } def initialize( source ) @@ -126,6 +126,7 @@ module REXML # Returns true if there are more events. Synonymous with !empty? def has_next? + return true if @closed @source.read if @source.buffer.size==0 and !@source.empty? (!@source.empty? and @source.buffer.strip.size>0) or @stack.size>0 or @closed end @@ -143,7 +144,7 @@ module REXML # event, so you can effectively pre-parse the entire document (pull the # entire thing into memory) using this method. def peek depth=0 - raise 'Illegal argument "#{depth}"' if depth < -1 + raise %Q[Illegal argument "#{depth}"] if depth < -1 temp = [] if depth == -1 temp.push(pull()) until empty? @@ -166,8 +167,9 @@ module REXML return @stack.shift if @stack.size > 0 @source.read if @source.buffer.size==0 if @document_status == nil - @source.match( /^\s*/um, true ) - word = @source.match( /^\s*(<.*?)>/um ) + @source.consume( /^\s*/um ) + word = @source.match( /(<.*?)>/um ) + #word = @source.match_to( '>', /(<.*?)>/um ) word = word[1] unless word.nil? case word when COMMENT_START @@ -190,7 +192,7 @@ module REXML close = md[2] identity =~ IDENTITY name = $1 - raise "DOCTYPE is missing a name" if name.nil? + raise REXML::ParseException("DOCTYPE is missing a name") if name.nil? pub_sys = $2.nil? ? nil : $2.strip long_name = $3.nil? ? nil : $3.strip uri = $4.nil? ? nil : $4.strip @@ -274,10 +276,11 @@ module REXML return [ :end_doctype ] end end - begin + begin if @source.buffer[0] == ?< if @source.buffer[1] == ?/ last_tag = @tags.pop + #md = @source.match_to_consume( '>', CLOSE_MATCH) md = @source.match( CLOSE_MATCH, true ) raise REXML::ParseException.new( "Missing end tag for '#{last_tag}' "+ "(got \"#{md[1]}\")", @source) unless last_tag == md[1] @@ -286,18 +289,20 @@ module REXML md = @source.match(/\A(\s*[^>]*>)/um) #puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" raise REXML::ParseException.new("Malformed node", @source) unless md - case md[1] - when CDATA_START - return [ :cdata, @source.match( CDATA_PATTERN, true )[1] ] - when COMMENT_START - return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + if md[0][2] == ?- + md = @source.match( COMMENT_PATTERN, true ) + return [ :comment, md[1] ] if md else - raise REXML::ParseException.new( "Declarations can only occur "+ - "in the doctype declaration.", @source) + md = @source.match( CDATA_PATTERN, true ) + return [ :cdata, md[1] ] if md end + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) elsif @source.buffer[1] == ?? md = @source.match( INSTRUCTION_PATTERN, true ) - return [ :processing_instruction, md[1], md[2] ] + return [ :processing_instruction, md[1], md[2] ] if md + raise REXML::ParseException.new( "Bad instruction declaration", + @source) else # Get the next tag md = @source.match(TAG_MATCH, true) @@ -318,17 +323,19 @@ module REXML return [ :start_element, md[1], attributes ] end else - md = @source.match(TEXT_PATTERN, true) - raise "no text to add" if md[0].length == 0 + md = @source.match( TEXT_PATTERN, true ) + #md = @source.match_to_consume( '<', TEXT_PATTERN ) + #@source.read + raise REXML::ParseException("no text to add") if md[0].length == 0 # unnormalized = Text::unnormalize( md[1], self ) # return PullEvent.new( :text, md[1], unnormalized ) return [ :text, md[1] ] end - rescue REXML::ParseException - raise $! + rescue REXML::ParseException + raise rescue Exception, NameError => error raise REXML::ParseException.new( "Exception parsing", - @source, self, error ) + @source, self, (error ? error : $!) ) end return [ :dummy ] end @@ -354,7 +361,7 @@ module REXML end if entities copy.gsub!( EREFERENCE, '&' ) DEFAULT_ENTITIES.each do |key, value| - copy.gsub!( value[2], value[1] ) + copy.gsub!( value[3], value[1] ) end copy end diff --git a/lib/rexml/parsers/lightparser.rb b/lib/rexml/parsers/lightparser.rb index e2f083bc8e..8c555f7960 100644 --- a/lib/rexml/parsers/lightparser.rb +++ b/lib/rexml/parsers/lightparser.rb @@ -16,25 +16,25 @@ module REXML end def parse - root = context = REXML::Light::Node.new([ :document ]) + root = context = [ :document ] while true event = @parser.pull case event[0] when :end_document break when :end_doctype - context = context.parent + context = context[1] when :start_element, :start_doctype - new_node = REXML::Light::Node.new(event) + new_node = event context << new_node - new_node.parent = context + new_node[1,0] = [context] context = new_node when :end_element, :end_doctype - context = context.parent + context = context[1] else - new_node = REXML::Light::Node.new(event) + new_node = event context << new_node - new_node.parent = context + new_node[1,0] = [context] end end root diff --git a/lib/rexml/quickpath.rb b/lib/rexml/quickpath.rb index c099db8579..2c54ac1999 100644 --- a/lib/rexml/quickpath.rb +++ b/lib/rexml/quickpath.rb @@ -31,7 +31,7 @@ module REXML results = filter([element], path) when /^\*/u results = filter(element.to_a, path) - when /^[\[!\w:]/u + when /^[[!\w:]/u # match on child matches = [] children = element.to_a diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index b7de03f3f9..8c402dd23f 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -21,6 +21,6 @@ # A tutorial is available in docs/tutorial.html module REXML Copyright = "Copyright #{Time.now.year} Sean Russell " - Date = "+2003/110" - Version = "2.7.1" + Date = "+2003/283" + Version = "2.7.2" end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 8c175785b7..915b6efc27 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -39,10 +39,6 @@ module REXML # Overridden to support optimized en/decoding def encoding=(enc) super - eval <<-EOL - alias :encode :to_#{encoding.tr('-', '_').downcase} - alias :decode :from_#{encoding.tr('-', '_').downcase} - EOL @line_break = encode( '>' ) if enc != UTF_8 @buffer = decode(@buffer) @@ -78,8 +74,22 @@ module REXML def read end + def consume( pattern ) + @buffer = $' if pattern.match( @buffer ) + end + + def match_to( char, pattern ) + return pattern.match(@buffer) + end + + def match_to_consume( char, pattern ) + md = pattern.match(@buffer) + @buffer = $' + return md + end + def match pattern, consume=false - md = pattern.match @buffer + md = pattern.match(@buffer) @buffer = $' if consume and md return md end @@ -112,7 +122,9 @@ module REXML #@block_size = block_size #super @source.read(@block_size) @line_break = '>' - super @source.readline( @line_break ) + #super @source.readline( "\n" ) + super @source.readline( @line_break )+@source.read + @line_break = encode( '>' ) end def scan pattern, consume=false @@ -145,11 +157,15 @@ module REXML str = @source.readline('>') str = decode(str) if @to_utf and str @buffer << str - rescue + rescue Exception, NameError @source = nil end end + def consume( pattern ) + match( pattern, true ) + end + def match pattern, consume=false rv = pattern.match(@buffer) @buffer = $' if consume and rv diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index 215078b766..9cd1e5d64c 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -2,16 +2,6 @@ require 'rexml/namespace' require 'rexml/xmltokens' require 'rexml/parsers/xpathparser' -# Ignore this class. It adds a __ne__ method, because Ruby doesn't seem to -# understand object.send( "!=", foo ), whereas it *does* understand "<", "==", -# and all of the other comparison methods. Stupid, and annoying, and not at -# all POLS. -class Object - def __ne__(b) - self != b - end -end - module REXML # You don't want to use this class. Really. Use XPath, which is a wrapper # for this class. Believe me. You don't want to poke around in here. @@ -132,11 +122,10 @@ module REXML when :child #puts "CHILD" new_nodeset = [] - ps_clone = nil + nt = nil for node in nodeset - #ps_clone = path_stack.clone - #new_nodeset += internal_parse( ps_clone, node.children ) if node.parent? - new_nodeset += node.children if node.parent? + nt = node.node_type + new_nodeset += node.children if nt == :element or nt == :document end #path_stack[0,(path_stack.size-ps_clone.size)] = [] return new_nodeset @@ -238,9 +227,11 @@ module REXML when :descendant #puts ":DESCENDANT" results = [] + nt = nil for node in nodeset + nt = node.node_type results += internal_parse( path_stack.clone.unshift( :descendant_or_self ), - node.children ) if node.parent? + node.children ) if nt == :element or nt == :document end return results @@ -310,11 +301,13 @@ module REXML def d_o_s( p, ns, r ) #puts r.collect{|n|n.to_s}.inspect #puts ns.collect{|n|n.to_s}.inspect + nt = nil ns.each_index do |i| n = ns[i] x = match( p.clone, [ n ] ) #puts "Got a match on #{p.inspect} for #{ns.collect{|n|n.to_s+"("+n.type.to_s+")"}.inspect}" - d_o_s( p, n.children, x ) if n.parent? + nt = n.node_type + d_o_s( p, n.children, x ) if nt == :element or nt == :document r[i,0] = [x] if x.size > 0 end end -- cgit v1.2.3