From fa4bfa6af585589e4465831f1489fee83ce26f09 Mon Sep 17 00:00:00 2001
From: ser
Date: Sat, 20 Jan 2007 03:56:02 +0000
Subject: Merged from REXML main repository:

Fixes ticket:68. NOTE that this involves an API change! Entity declarations in the doctype now generate events that carry two, not one, arguments.

Implements ticket:15, using gwrite's suggestion. This allows Element to be subclassed.

Two unrelated changes, bundled together because Subversion doesn't do block-level commits:
1) Fixed a typo bug in the previous change for ticket:15
2) Fixed namespaces handling in XPath and element.
***** Note that this is an API change!!! *****
Element.namespaces() now returns a hash of namespace mappings which are relevant for that node.

Fixes a bug in multiple decodings.

The changeset 1230:1231 was bad. The default behavior is *not* to use the native REXML encodings, but rather to use ICONV. I know that this will upset some people, but defaulting to the pure Ruby version isn't the correct solution, and it breaks other encodings, so I've reverted it.

* Fixes ticket:61 (xpath_parser)
* Fixes ticket:63 (UTF-16; UNILE decoding was bad)
* Cleans up some tests, removing opportunities for test corruption
* Improves parsing error messages a little
* Adds the ability to override the encoding detection in Source construction
* Fixes an edge case in Functions::string, where document nodes weren't correctly converted
* Fixes Functions::string() for Element and Document nodes
* Fixes some problems in entity handling

Addresses ticket:66
Fixes ticket:71
Addresses ticket:78

NOTE that this also fixes what is technically another bug in REXML. REXML's XPath parser used to allow exponential notation in numbers. The XPath spec is specific about what a number is, and scientific notation is not included. Therefore, this has been fixed.

Cross-ported a fix for ticket:88 from CVS.
Fixes ticket:80
Documentation cleanup. Ticket:84
Applied Kou's fix for an un-trac'ed bug.
------------------------------------------------------------------------ git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@11548 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/document.rb | 5 ++-- lib/rexml/element.rb | 25 ++++++++----------- lib/rexml/encoding.rb | 26 ++++++++++---------- lib/rexml/encodings/UNILE.rb | 2 +- lib/rexml/functions.rb | 25 +++++++++++++++---- lib/rexml/node.rb | 6 ++--- lib/rexml/parsers/baseparser.rb | 4 ---- lib/rexml/parsers/sax2parser.rb | 4 ++++ lib/rexml/parsers/treeparser.rb | 3 ++- lib/rexml/sax2listener.rb | 2 +- lib/rexml/source.rb | 23 +++++++++++++----- lib/rexml/text.rb | 47 ++++++++++++++++++------------------ lib/rexml/xpath_parser.rb | 53 ++++++++++++++++++++++++++++++++++------- 13 files changed, 142 insertions(+), 83 deletions(-) (limited to 'lib/rexml') diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 619a844257..ee3e58dd2b 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -157,8 +157,9 @@ module REXML # document will be written. # indent:: # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. Defaults to -1 + # indentation will be twice this number of spaces, and children will be + # indented an additional amount. For a value of 3, every item will be + # indented 3 more levels, or 6 more spaces (2 * 3). 
Defaults to -1 # transitive:: # If transitive is true and indent is >= 0, then the output will be # pretty-printed in such a way that the added whitespace does not affect diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 435076420a..11e2039609 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -94,7 +94,7 @@ module REXML # new_a = d.root.clone # puts new_a # => "" def clone - Element.new self + self.class.new self end # Evaluates to the root node of the document that this element @@ -200,9 +200,9 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} namespaces = parent.namespaces if parent - namespaces |= attributes.namespaces + namespaces = namespaces.merge( attributes.namespaces ) return namespaces end @@ -494,13 +494,12 @@ module REXML # doc.root.add_element 'c' #-> 'Elliott' # doc.root.text = 'Russell' #-> 'Russell' # doc.root.text = nil #-> '' - def text=( text ) + def text=( text ) if text.kind_of? String text = Text.new( text, whitespace(), nil, raw() ) elsif text and !text.kind_of? Text text = Text.new( text.to_s, whitespace(), nil, raw() ) end - old_text = get_text if text.nil? old_text.remove unless old_text.nil? @@ -557,13 +556,9 @@ module REXML ################################################# def attribute( name, namespace=nil ) - prefix = '' - if namespace - prefix = attributes.prefixes.each { |prefix| - return "#{prefix}:" if namespace( prefix ) == namespace - } || '' - end - attributes.get_attribute( "#{prefix}#{name}" ) + prefix = nil + prefix = namespaces.index(namespace) if namespace + attributes.get_attribute( "#{prefix ? 
prefix + ':' : ''}#{name}" ) end # Evaluates to +true+ if this element has any attributes set, false @@ -1172,16 +1167,16 @@ module REXML end def namespaces - namespaces = [] + namespaces = {} each_attribute do |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' end if @element.document and @element.document.doctype expn = @element.expanded_name expn = @element.document.doctype.name if expn.size == 0 @element.document.doctype.attributes_of(expn).each { |attribute| - namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + namespaces[attribute.name] = attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' } end namespaces diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index 3a92080b13..7c23f678bb 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -24,20 +24,20 @@ module REXML old_verbosity = $VERBOSE begin $VERBOSE = false - enc = enc.nil? ? nil : enc.upcase + enc = enc.nil? ? nil : enc.upcase return false if defined? 
@encoding and enc == @encoding if enc and enc != UTF_8 - @encoding = enc - raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ - @encoding.untaint - enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) - begin - require enc_file - Encoding.apply(self, @encoding) + @encoding = enc + raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ + @encoding.untaint + begin + require 'rexml/encodings/ICONV.rb' + Encoding.apply(self, "ICONV") rescue LoadError, Exception - begin - require 'rexml/encodings/ICONV.rb' - Encoding.apply(self, "ICONV") + begin + enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) + require enc_file + Encoding.apply(self, @encoding) rescue LoadError => err puts err.message raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." @@ -51,14 +51,14 @@ module REXML ensure $VERBOSE = old_verbosity end - true + true end def check_encoding str # We have to recognize UTF-16, LSB UTF-16, and UTF-8 return UTF_16 if /\A\xfe\xff/n =~ str return UNILE if /\A\xff\xfe/n =~ str - str =~ /^\s* # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl content + def entitydecl name, decl end # def notationdecl content diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index 3b6e813baf..2fee99c0e9 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -6,7 +6,7 @@ module REXML # Generates a Source object # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given - def SourceFactory::create_from arg#, slurp=true + def SourceFactory::create_from(arg) if arg.kind_of? String Source.new(arg) elsif arg.respond_to? 
:read and @@ -35,12 +35,19 @@ module REXML # Constructor # @param arg must be a String, and should be a valid XML document - def initialize(arg) + # @param encoding if non-null, sets the encoding of the source to this + # value, overriding all encoding detection + def initialize(arg, encoding=nil) @orig = @buffer = arg - self.encoding = check_encoding( @buffer ) + if encoding + self.encoding = encoding + else + self.encoding = check_encoding( @buffer ) + end @line = 0 end + # Inherited from Encoding # Overridden to support optimized en/decoding def encoding=(enc) @@ -124,7 +131,7 @@ module REXML #attr_reader :block_size # block_size has been deprecated - def initialize(arg, block_size=500) + def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false # Determining the encoding is a deceptively difficult issue to resolve. @@ -134,10 +141,12 @@ module REXML # if there is one. If there isn't one, the file MUST be UTF-8, as per # the XML spec. If there is one, we can determine the encoding from # it. + @buffer = "" str = @source.read( 2 ) - if /\A(?:\xfe\xff|\xff\xfe)/n =~ str + if encoding + self.encoding = encoding + elsif /\A(?:\xfe\xff|\xff\xfe)/n =~ str self.encoding = check_encoding( str ) - @line_break = encode( '>' ) else @line_break = '>' end @@ -159,6 +168,8 @@ module REXML str = @source.readline(@line_break) str = decode(str) if @to_utf and str @buffer << str + rescue Iconv::IllegalSequence + raise rescue @source = nil end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 55bc9f50f8..3de9170623 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -42,6 +42,7 @@ module REXML # Use this field if you have entities defined for some text, and you don't # want REXML to escape that text in output. 
# Text.new( "<&", false, nil, false ) #-> "<&" + # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;" # Text.new( "<&", false, nil, true ) #-> Parse exception # Text.new( "<&", false, nil, true ) #-> "<&" # # Assume that the entity "s" is defined to be "sean" @@ -172,17 +173,6 @@ module REXML end @unnormalized = Text::unnormalize( @string, doctype ) end - - def wrap(string, width, addnewline=false) - # Recursivly wrap string at width. - return string if string.length <= width - place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - if addnewline then - return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) - else - return string[0,place] + "\n" + wrap(string[place+1..-1], width) - end - end # Sets the contents of this text node. This expects the text to be # unnormalized. It returns self. @@ -198,17 +188,28 @@ module REXML @raw = false end - def indent_text(string, level=1, style="\t", indentfirstline=true) - return string if level < 0 - new_string = '' - string.each { |line| - indent_string = style * level - new_line = (indent_string + line).sub(/[\s]+$/,'') - new_string << new_line - } - new_string.strip! unless indentfirstline - return new_string + def wrap(string, width, addnewline=false) + # Recursivly wrap string at width. + return string if string.length <= width + place = string.rindex(' ', width) # Position in string with last ' ' before cutoff + if addnewline then + return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + else + return string[0,place] + "\n" + wrap(string[place+1..-1], width) + end end + + def indent_text(string, level=1, style="\t", indentfirstline=true) + return string if level < 0 + new_string = '' + string.each { |line| + indent_string = style * level + new_line = (indent_string + line).sub(/[\s]+$/,'') + new_string << new_line + } + new_string.strip! 
unless indentfirstline + return new_string + end def write( writer, indent=-1, transitive=false, ie_hack=false ) s = to_s() @@ -286,9 +287,10 @@ module REXML def Text::normalize( input, doctype=nil, entity_filter=nil ) copy = input # Doing it like this rather than in a loop improves the speed + #copy = copy.gsub( EREFERENCE, '&' ) + copy = copy.gsub( "&", "&" ) if doctype # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) doctype.entities.each_value do |entity| copy = copy.gsub( entity.value, "&#{entity.name};" ) if entity.value and @@ -296,7 +298,6 @@ module REXML end else # Replace all ampersands that aren't part of an entity - copy = copy.gsub( EREFERENCE, '&' ) DocType::DEFAULT_ENTITIES.each_value do |entity| copy = copy.gsub(entity.value, "&#{entity.name};" ) end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index a813236e10..453d57a88d 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -160,8 +160,13 @@ module REXML node_types = ELEMENTS return nodeset if path_stack.length == 0 || nodeset.length == 0 while path_stack.length > 0 + #puts "#"*5 #puts "Path stack = #{path_stack.inspect}" #puts "Nodeset is #{nodeset.inspect}" + if nodeset.length == 0 + path_stack.clear + return [] + end case (op = path_stack.shift) when :document nodeset = [ nodeset[0].root_node ] @@ -235,9 +240,11 @@ module REXML name = path_stack.shift for element in nodeset if element.node_type == :element - #puts element.name - attr = element.attribute( name, get_namespace(element, prefix) ) - new_nodeset << attr if attr + #puts "Element name = #{element.name}" + #puts "get_namespace( #{element.inspect}, #{prefix} ) = #{get_namespace(element, prefix)}" + attrib = element.attribute( name, get_namespace(element, prefix) ) + #puts "attrib = #{attrib.inspect}" + new_nodeset << attrib if attrib end end when :any @@ -299,8 +306,10 @@ module REXML #puts "Adding node #{node.inspect}" if result == (index+1) 
new_nodeset << node if result == (index+1) elsif result.instance_of? Array - #puts "Adding node #{node.inspect}" if result.size > 0 - new_nodeset << node if result.size > 0 + if result.size > 0 and result.inject(false) {|k,s| s or k} + #puts "Adding node #{node.inspect}" if result.size > 0 + new_nodeset << node if result.size > 0 + end else #puts "Adding node #{node.inspect}" if result new_nodeset << node if result @@ -381,9 +390,25 @@ module REXML node_types = ELEMENTS when :namespace - new_set = [] + #puts "In :namespace" + new_nodeset = [] + prefix = path_stack.shift for node in nodeset - new_nodeset << node.namespace if node.node_type == :element or node.node_type == :attribute + if (node.node_type == :element or node.node_type == :attribute) + if @namespaces + namespaces = @namespaces + elsif (node.node_type == :element) + namespaces = node.namespaces + else + namespaces = node.element.namesapces + end + #puts "Namespaces = #{namespaces.inspect}" + #puts "Prefix = #{prefix.inspect}" + #puts "Node.namespace = #{node.namespace}" + if (node.namespace == namespaces[prefix]) + new_nodeset << node + end + end end nodeset = new_nodeset @@ -404,6 +429,18 @@ module REXML #puts "RES => #{res.inspect}" return res + when :and + left = expr( path_stack.shift, nodeset.dup, context ) + #puts "LEFT => #{left.inspect} (#{left.class.name})" + if left == false || left.nil? || !left.inject(false) {|a,b| a | b} + return [] + end + right = expr( path_stack.shift, nodeset.dup, context ) + #puts "RIGHT => #{right.inspect} (#{right.class.name})" + res = equality_relational_compare( left, op, right ) + #puts "RES => #{res.inspect}" + return res + when :div left = Functions::number(expr(path_stack.shift, nodeset, context)).to_f right = Functions::number(expr(path_stack.shift, nodeset, context)).to_f @@ -477,7 +514,7 @@ module REXML # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better # way of doing this, be my guest. 
This really sucks, but - # it took me three days to get it to work at all. + # it is a wonder it works at all. # ######################################################## def descendant_or_self( path_stack, nodeset ) -- cgit v1.2.3