diff options
Diffstat (limited to 'lib/rexml')
61 files changed, 3351 insertions, 3201 deletions
diff --git a/lib/rexml/attlistdecl.rb b/lib/rexml/attlistdecl.rb index ef4721b5ce..dc1d2add0b 100644 --- a/lib/rexml/attlistdecl.rb +++ b/lib/rexml/attlistdecl.rb @@ -1,62 +1,63 @@ +# frozen_string_literal: false #vim:ts=2 sw=2 noexpandtab: require 'rexml/child' require 'rexml/source' module REXML - # This class needs: - # * Documentation - # * Work! Not all types of attlists are intelligently parsed, so we just - # spew back out what we get in. This works, but it would be better if - # we formatted the output ourselves. - # - # AttlistDecls provide *just* enough support to allow namespace - # declarations. If you need some sort of generalized support, or have an - # interesting idea about how to map the hideous, terrible design of DTD - # AttlistDecls onto an intuitive Ruby interface, let me know. I'm desperate - # for anything to make DTDs more palateable. - class AttlistDecl < Child - include Enumerable - - # What is this? Got me. - attr_reader :element_name - - # Create an AttlistDecl, pulling the information from a Source. Notice - # that this isn't very convenient; to create an AttlistDecl, you basically - # have to format it yourself, and then have the initializer parse it. - # Sorry, but for the forseeable future, DTD support in REXML is pretty - # weak on convenience. Have I mentioned how much I hate DTDs? - def initialize(source) - super() - if (source.kind_of? Array) - @element_name, @pairs, @contents = *source - end - end - - # Access the attlist attribute/value pairs. - # value = attlist_decl[ attribute_name ] - def [](key) - @pairs[key] - end - - # Whether an attlist declaration includes the given attribute definition - # if attlist_decl.include? "xmlns:foobar" - def include?(key) - @pairs.keys.include? key - end - - # Iterate over the key/value pairs: - # attlist_decl.each { |attribute_name, attribute_value| ... } - def each(&block) - @pairs.each(&block) - end - - # Write out exactly what we got in. - def write out, indent=-1 - out << @contents - end - - def node_type - :attlistdecl - end - end + # This class needs: + # * Documentation + # * Work! Not all types of attlists are intelligently parsed, so we just + # spew back out what we get in. This works, but it would be better if + # we formatted the output ourselves. + # + # AttlistDecls provide *just* enough support to allow namespace + # declarations. If you need some sort of generalized support, or have an + # interesting idea about how to map the hideous, terrible design of DTD + # AttlistDecls onto an intuitive Ruby interface, let me know. I'm desperate + # for anything to make DTDs more palateable. + class AttlistDecl < Child + include Enumerable + + # What is this? Got me. + attr_reader :element_name + + # Create an AttlistDecl, pulling the information from a Source. Notice + # that this isn't very convenient; to create an AttlistDecl, you basically + # have to format it yourself, and then have the initializer parse it. + # Sorry, but for the foreseeable future, DTD support in REXML is pretty + # weak on convenience. Have I mentioned how much I hate DTDs? + def initialize(source) + super() + if (source.kind_of? Array) + @element_name, @pairs, @contents = *source + end + end + + # Access the attlist attribute/value pairs. + # value = attlist_decl[ attribute_name ] + def [](key) + @pairs[key] + end + + # Whether an attlist declaration includes the given attribute definition + # if attlist_decl.include? "xmlns:foobar" + def include?(key) + @pairs.keys.include? key + end + + # Iterate over the key/value pairs: + # attlist_decl.each { |attribute_name, attribute_value| ... } + def each(&block) + @pairs.each(&block) + end + + # Write out exactly what we got in. + def write out, indent=-1 + out << @contents + end + + def node_type + :attlistdecl + end + end end diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb index 89c1ada36c..ca5984e178 100644 --- a/lib/rexml/attribute.rb +++ b/lib/rexml/attribute.rb @@ -1,169 +1,176 @@ +# frozen_string_literal: false require "rexml/namespace" require 'rexml/text' module REXML - # Defines an Element Attribute; IE, a attribute=value pair, as in: - # <element attribute="value"/>. Attributes can be in their own - # namespaces. General users of REXML will not interact with the - # Attribute class much. - class Attribute - include Node - include Namespace - - # The element to which this attribute belongs - attr_reader :element - # The normalized value of this attribute. That is, the attribute with - # entities intact. - attr_writer :normalized - PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um - - # Constructor. + # Defines an Element Attribute; IE, a attribute=value pair, as in: + # <element attribute="value"/>. Attributes can be in their own + # namespaces. General users of REXML will not interact with the + # Attribute class much. + class Attribute + include Node + include Namespace + + # The element to which this attribute belongs + attr_reader :element + # The normalized value of this attribute. That is, the attribute with + # entities intact. + attr_writer :normalized + PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um + + # Constructor. # FIXME: The parser doesn't catch illegal characters in attributes # - # first:: + # first:: # Either: an Attribute, which this new attribute will become a # clone of; or a String, which is the name of this attribute # second:: # If +first+ is an Attribute, then this may be an Element, or nil. # If nil, then the Element parent of this attribute is the parent - # of the +first+ Attribute. If the first argument is a String, - # then this must also be a String, and is the content of the attribute. + # of the +first+ Attribute. If the first argument is a String, + # then this must also be a String, and is the content of the attribute. # If this is the content, it must be fully normalized (contain no # illegal characters). # parent:: - # Ignored unless +first+ is a String; otherwise, may be the Element + # Ignored unless +first+ is a String; otherwise, may be the Element # parent of this attribute, or nil. # - # - # Attribute.new( attribute_to_clone ) - # Attribute.new( attribute_to_clone, parent_element ) - # Attribute.new( "attr", "attr_value" ) - # Attribute.new( "attr", "attr_value", parent_element ) - def initialize( first, second=nil, parent=nil ) - @normalized = @unnormalized = @element = nil - if first.kind_of? Attribute - self.name = first.expanded_name - @unnormalized = first.value - if second.kind_of? Element - @element = second - else - @element = first.element - end - elsif first.kind_of? String - @element = parent - self.name = first - @normalized = second.to_s - else - raise "illegal argument #{first.class.name} to Attribute constructor" - end - end - - # Returns the namespace of the attribute. - # - # e = Element.new( "elns:myelement" ) - # e.add_attribute( "nsa:a", "aval" ) - # e.add_attribute( "b", "bval" ) - # e.attributes.get_attribute( "a" ).prefix # -> "nsa" - # e.attributes.get_attribute( "b" ).prefix # -> "elns" - # a = Attribute.new( "x", "y" ) - # a.prefix # -> "" - def prefix - pf = super - if pf == "" - pf = @element.prefix if @element - end - pf - end - - # Returns the namespace URL, if defined, or nil otherwise - # - # e = Element.new("el") - # e.add_attributes({"xmlns:ns", "http://url"}) - # e.namespace( "ns" ) # -> "http://url" - def namespace arg=nil - arg = prefix if arg.nil? - @element.namespace arg - end - - # Returns true if other is an Attribute and has the same name and value, - # false otherwise. - def ==( other ) - other.kind_of?(Attribute) and other.name==name and other.value==value - end - - # Creates (and returns) a hash from both the name and value - def hash - name.hash + value.hash - end - - # Returns this attribute out as XML source, expanding the name - # - # a = Attribute.new( "x", "y" ) - # a.to_string # -> "x='y'" - # b = Attribute.new( "ns:x", "y" ) - # b.to_string # -> "ns:x='y'" - def to_string - if @element and @element.context and @element.context[:attribute_quote] == :quote - %Q^#@expanded_name="#{to_s().gsub(/"/, '"e;')}"^ - else - "#@expanded_name='#{to_s().gsub(/'/, ''')}'" - end - end - - # Returns the attribute value, with entities replaced - def to_s - return @normalized if @normalized - - doctype = nil - if @element - doc = @element.document - doctype = doc.doctype if doc - end - - @normalized = Text::normalize( @unnormalized, doctype ) - @unnormalized = nil + # + # Attribute.new( attribute_to_clone ) + # Attribute.new( attribute_to_clone, parent_element ) + # Attribute.new( "attr", "attr_value" ) + # Attribute.new( "attr", "attr_value", parent_element ) + def initialize( first, second=nil, parent=nil ) + @normalized = @unnormalized = @element = nil + if first.kind_of? Attribute + self.name = first.expanded_name + @unnormalized = first.value + if second.kind_of? Element + @element = second + else + @element = first.element + end + elsif first.kind_of? String + @element = parent + self.name = first + @normalized = second.to_s + else + raise "illegal argument #{first.class.name} to Attribute constructor" + end + end + + # Returns the namespace of the attribute. + # + # e = Element.new( "elns:myelement" ) + # e.add_attribute( "nsa:a", "aval" ) + # e.add_attribute( "b", "bval" ) + # e.attributes.get_attribute( "a" ).prefix # -> "nsa" + # e.attributes.get_attribute( "b" ).prefix # -> "elns" + # a = Attribute.new( "x", "y" ) + # a.prefix # -> "" + def prefix + pf = super + if pf == "" + pf = @element.prefix if @element + end + pf + end + + # Returns the namespace URL, if defined, or nil otherwise + # + # e = Element.new("el") + # e.add_namespace("ns", "http://url") + # e.add_attribute("ns:a", "b") + # e.add_attribute("nsx:a", "c") + # e.attribute("ns:a").namespace # => "http://url" + # e.attribute("nsx:a").namespace # => nil + def namespace arg=nil + arg = prefix if arg.nil? + @element.namespace arg + end + + # Returns true if other is an Attribute and has the same name and value, + # false otherwise. + def ==( other ) + other.kind_of?(Attribute) and other.name==name and other.value==value + end + + # Creates (and returns) a hash from both the name and value + def hash + name.hash + value.hash + end + + # Returns this attribute out as XML source, expanding the name + # + # a = Attribute.new( "x", "y" ) + # a.to_string # -> "x='y'" + # b = Attribute.new( "ns:x", "y" ) + # b.to_string # -> "ns:x='y'" + def to_string + if @element and @element.context and @element.context[:attribute_quote] == :quote + %Q^#@expanded_name="#{to_s().gsub(/"/, '"')}"^ + else + "#@expanded_name='#{to_s().gsub(/'/, ''')}'" + end + end + + def doctype + if @element + doc = @element.document + doc.doctype if doc + end + end + + # Returns the attribute value, with entities replaced + def to_s + return @normalized if @normalized + + @normalized = Text::normalize( @unnormalized, doctype ) + @unnormalized = nil @normalized - end - - # Returns the UNNORMALIZED value of this attribute. That is, entities - # have been expanded to their values - def value - return @unnormalized if @unnormalized - doctype = nil - if @element - doc = @element.document - doctype = doc.doctype if doc - end - @unnormalized = Text::unnormalize( @normalized, doctype ) - @normalized = nil + end + + # Returns the UNNORMALIZED value of this attribute. That is, entities + # have been expanded to their values + def value + return @unnormalized if @unnormalized + @unnormalized = Text::unnormalize( @normalized, doctype ) + @normalized = nil @unnormalized - end - - # Returns a copy of this attribute - def clone - Attribute.new self - end - - # Sets the element of which this object is an attribute. Normally, this - # is not directly called. - # - # Returns this attribute - def element=( element ) - @element = element - self - end - - # Removes this Attribute from the tree, and returns true if successfull - # - # This method is usually not called directly. - def remove - @element.attributes.delete self.name unless @element.nil? - end - - # Writes this attribute (EG, puts 'key="value"' to the output) - def write( output, indent=-1 ) - output << to_string - end + end + + # Returns a copy of this attribute + def clone + Attribute.new self + end + + # Sets the element of which this object is an attribute. Normally, this + # is not directly called. + # + # Returns this attribute + def element=( element ) + @element = element + + if @normalized + Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype ) + end + + self + end + + # Removes this Attribute from the tree, and returns true if successful + # + # This method is usually not called directly. + def remove + @element.attributes.delete self.name unless @element.nil? + end + + # Writes this attribute (EG, puts 'key="value"' to the output) + def write( output, indent=-1 ) + output << to_string + end def node_type :attribute @@ -180,6 +187,6 @@ module REXML path += "/@#{self.expanded_name}" return path end - end + end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb index efcb71160a..2238446dc4 100644 --- a/lib/rexml/cdata.rb +++ b/lib/rexml/cdata.rb @@ -1,39 +1,40 @@ +# frozen_string_literal: false require "rexml/text" module REXML - class CData < Text - START = '<![CDATA[' - STOP = ']]>' - ILLEGAL = /(\]\]>)/ + class CData < Text + START = '<![CDATA[' + STOP = ']]>' + ILLEGAL = /(\]\]>)/ - # Constructor. CData is data between <![CDATA[ ... ]]> - # - # _Examples_ - # CData.new( source ) - # CData.new( "Here is some CDATA" ) - # CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element ) - def initialize( first, whitespace=true, parent=nil ) - super( first, whitespace, parent, true, true, ILLEGAL ) - end + # Constructor. CData is data between <![CDATA[ ... ]]> + # + # _Examples_ + # CData.new( source ) + # CData.new( "Here is some CDATA" ) + # CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element ) + def initialize( first, whitespace=true, parent=nil ) + super( first, whitespace, parent, false, true, ILLEGAL ) + end - # Make a copy of this object - # - # _Examples_ - # c = CData.new( "Some text" ) - # d = c.clone - # d.to_s # -> "Some text" - def clone - CData.new self - end + # Make a copy of this object + # + # _Examples_ + # c = CData.new( "Some text" ) + # d = c.clone + # d.to_s # -> "Some text" + def clone + CData.new self + end - # Returns the content of this CData object - # - # _Examples_ - # c = CData.new( "Some text" ) - # c.to_s # -> "Some text" - def to_s - @string - end + # Returns the content of this CData object + # + # _Examples_ + # c = CData.new( "Some text" ) + # c.to_s # -> "Some text" + def to_s + @string + end def value @string @@ -42,26 +43,26 @@ module REXML # == DEPRECATED # See the rexml/formatters package # - # Generates XML output of this object - # - # output:: - # Where to write the string. Defaults to $stdout - # indent:: + # Generates XML output of this object + # + # output:: + # Where to write the string. Defaults to $stdout + # indent:: # The amount to indent this node by - # transitive:: + # transitive:: # Ignored - # ie_hack:: + # ie_hack:: # Ignored - # - # _Examples_ - # c = CData.new( " Some text " ) - # c.write( $stdout ) #-> <![CDATA[ Some text ]]> - def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn( "#{self.class.name}.write is deprecated" ) - indent( output, indent ) - output << START - output << @string - output << STOP - end - end + # + # _Examples_ + # c = CData.new( " Some text " ) + # c.write( $stdout ) #-> <![CDATA[ Some text ]]> + def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + indent( output, indent ) + output << START + output << @string + output << STOP + end + end end diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb index 6d3c9df5e6..d23451e71e 100644 --- a/lib/rexml/child.rb +++ b/lib/rexml/child.rb @@ -1,96 +1,97 @@ +# frozen_string_literal: false require "rexml/node" module REXML - ## - # A Child object is something contained by a parent, and this class - # contains methods to support that. Most user code will not use this - # class directly. - class Child - include Node - attr_reader :parent # The Parent of this object + ## + # A Child object is something contained by a parent, and this class + # contains methods to support that. Most user code will not use this + # class directly. + class Child + include Node + attr_reader :parent # The Parent of this object - # Constructor. Any inheritors of this class should call super to make - # sure this method is called. - # parent:: - # if supplied, the parent of this child will be set to the - # supplied value, and self will be added to the parent - def initialize( parent = nil ) - @parent = nil - # Declare @parent, but don't define it. The next line sets the - # parent. - parent.add( self ) if parent - end + # Constructor. Any inheritors of this class should call super to make + # sure this method is called. + # parent:: + # if supplied, the parent of this child will be set to the + # supplied value, and self will be added to the parent + def initialize( parent = nil ) + @parent = nil + # Declare @parent, but don't define it. The next line sets the + # parent. + parent.add( self ) if parent + end - # Replaces this object with another object. Basically, calls - # Parent.replace_child - # - # Returns:: self - def replace_with( child ) - @parent.replace_child( self, child ) - self - end + # Replaces this object with another object. Basically, calls + # Parent.replace_child + # + # Returns:: self + def replace_with( child ) + @parent.replace_child( self, child ) + self + end - # Removes this child from the parent. - # - # Returns:: self - def remove - unless @parent.nil? - @parent.delete self - end - self - end + # Removes this child from the parent. + # + # Returns:: self + def remove + unless @parent.nil? + @parent.delete self + end + self + end - # Sets the parent of this child to the supplied argument. - # - # other:: - # Must be a Parent object. If this object is the same object as the - # existing parent of this child, no action is taken. Otherwise, this - # child is removed from the current parent (if one exists), and is added - # to the new parent. - # Returns:: The parent added - def parent=( other ) - return @parent if @parent == other - @parent.delete self if defined? @parent and @parent - @parent = other - end + # Sets the parent of this child to the supplied argument. + # + # other:: + # Must be a Parent object. If this object is the same object as the + # existing parent of this child, no action is taken. Otherwise, this + # child is removed from the current parent (if one exists), and is added + # to the new parent. + # Returns:: The parent added + def parent=( other ) + return @parent if @parent == other + @parent.delete self if defined? @parent and @parent + @parent = other + end - alias :next_sibling :next_sibling_node - alias :previous_sibling :previous_sibling_node + alias :next_sibling :next_sibling_node + alias :previous_sibling :previous_sibling_node - # Sets the next sibling of this child. This can be used to insert a child - # after some other child. - # a = Element.new("a") - # b = a.add_element("b") - # c = Element.new("c") - # b.next_sibling = c - # # => <a><b/><c/></a> - def next_sibling=( other ) - parent.insert_after self, other - end + # Sets the next sibling of this child. This can be used to insert a child + # after some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.next_sibling = c + # # => <a><b/><c/></a> + def next_sibling=( other ) + parent.insert_after self, other + end - # Sets the previous sibling of this child. This can be used to insert a - # child before some other child. - # a = Element.new("a") - # b = a.add_element("b") - # c = Element.new("c") - # b.previous_sibling = c - # # => <a><b/><c/></a> - def previous_sibling=(other) - parent.insert_before self, other - end + # Sets the previous sibling of this child. This can be used to insert a + # child before some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.previous_sibling = c + # # => <a><b/><c/></a> + def previous_sibling=(other) + parent.insert_before self, other + end - # Returns:: the document this child belongs to, or nil if this child - # belongs to no document - def document - return parent.document unless parent.nil? - nil - end + # Returns:: the document this child belongs to, or nil if this child + # belongs to no document + def document + return parent.document unless parent.nil? + nil + end - # This doesn't yet handle encodings - def bytes - encoding = document.encoding + # This doesn't yet handle encodings + def bytes + document.encoding - to_s - end - end + to_s + end + end end diff --git a/lib/rexml/comment.rb b/lib/rexml/comment.rb index 2b9b4b89c9..822fe0d586 100644 --- a/lib/rexml/comment.rb +++ b/lib/rexml/comment.rb @@ -1,80 +1,80 @@ +# frozen_string_literal: false require "rexml/child" module REXML - ## - # Represents an XML comment; that is, text between \<!-- ... --> - class Comment < Child - include Comparable - START = "<!--" - STOP = "-->" + ## + # Represents an XML comment; that is, text between \<!-- ... --> + class Comment < Child + include Comparable + START = "<!--" + STOP = "-->" - # The content text + # The content text - attr_accessor :string + attr_accessor :string - ## - # Constructor. The first argument can be one of three types: - # @param first If String, the contents of this comment are set to the - # argument. If Comment, the argument is duplicated. If - # Source, the argument is scanned for a comment. - # @param second If the first argument is a Source, this argument - # should be nil, not supplied, or a Parent to be set as the parent - # of this object - def initialize( first, second = nil ) - #puts "IN COMMENT CONSTRUCTOR; SECOND IS #{second.type}" - super(second) - if first.kind_of? String - @string = first - elsif first.kind_of? Comment - @string = first.string - end - end + ## + # Constructor. The first argument can be one of three types: + # @param first If String, the contents of this comment are set to the + # argument. If Comment, the argument is duplicated. If + # Source, the argument is scanned for a comment. + # @param second If the first argument is a Source, this argument + # should be nil, not supplied, or a Parent to be set as the parent + # of this object + def initialize( first, second = nil ) + super(second) + if first.kind_of? String + @string = first + elsif first.kind_of? Comment + @string = first.string + end + end - def clone - Comment.new self - end + def clone + Comment.new self + end # == DEPRECATED # See REXML::Formatters # - # output:: - # Where to write the string - # indent:: - # An integer. If -1, no indenting will be used; otherwise, the - # indentation will be this number of spaces, and children will be - # indented an additional amount. - # transitive:: - # Ignored by this class. The contents of comments are never modified. - # ie_hack:: - # Needed for conformity to the child API, but not used by this class. - def write( output, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("Comment.write is deprecated. See REXML::Formatters") - indent( output, indent ) - output << START - output << @string - output << STOP - end + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # Ignored by this class. The contents of comments are never modified. + # ie_hack:: + # Needed for conformity to the child API, but not used by this class. + def write( output, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn("Comment.write is deprecated. See REXML::Formatters", uplevel: 1) + indent( output, indent ) + output << START + output << @string + output << STOP + end - alias :to_s :string + alias :to_s :string - ## - # Compares this Comment to another; the contents of the comment are used - # in the comparison. - def <=>(other) - other.to_s <=> @string - end + ## + # Compares this Comment to another; the contents of the comment are used + # in the comparison. + def <=>(other) + other.to_s <=> @string + end - ## - # Compares this Comment to another; the contents of the comment are used - # in the comparison. - def ==( other ) - other.kind_of? Comment and - (other <=> self) == 0 - end + ## + # Compares this Comment to another; the contents of the comment are used + # in the comparison. + def ==( other ) + other.kind_of? Comment and + (other <=> self) == 0 + end def node_type :comment end - end + end end #vim:ts=2 sw=2 noexpandtab: diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index 05cd4ab331..cb9bf57406 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require "rexml/parent" require "rexml/parseexception" require "rexml/namespace" @@ -6,6 +7,39 @@ require 'rexml/attlistdecl' require 'rexml/xmltokens' module REXML + class ReferenceWriter + def initialize(id_type, + public_id_literal, + system_literal) + @id_type = id_type + @public_id_literal = public_id_literal + @system_literal = system_literal + @default_quote = "\"" + end + + def write(output) + output << " #{@id_type}" + if @public_id_literal + if @public_id_literal.include?("'") + quote = "\"" + else + quote = @default_quote + end + output << " #{quote}#{@public_id_literal}#{quote}" + end + if @system_literal + if @system_literal.include?("'") + quote = "\"" + elsif @system_literal.include?("\"") + quote = "'" + else + quote = @default_quote + end + output << " #{quote}#{@system_literal}#{quote}" + end + end + end + # Represents an XML DOCTYPE declaration; that is, the contents of <!DOCTYPE # ... >. DOCTYPES can be used to declare the DTD of a document, as well as # being used to declare entities used in the document. @@ -15,11 +49,11 @@ module REXML STOP = ">" SYSTEM = "SYSTEM" PUBLIC = "PUBLIC" - DEFAULT_ENTITIES = { - 'gt'=>EntityConst::GT, - 'lt'=>EntityConst::LT, - 'quot'=>EntityConst::QUOT, - "apos"=>EntityConst::APOS + DEFAULT_ENTITIES = { + 'gt'=>EntityConst::GT, + 'lt'=>EntityConst::LT, + 'quot'=>EntityConst::QUOT, + "apos"=>EntityConst::APOS } # name is the name of the doctype @@ -33,7 +67,7 @@ module REXML # dt = DocType.new( doctype_to_clone ) # # Incomplete. Shallow clone of doctype # - # +Note+ that the constructor: + # +Note+ that the constructor: # # Doctype.new( Source.new( "<!DOCTYPE foo 'bar'>" ) ) # @@ -49,6 +83,8 @@ module REXML super( parent ) @name = first.name @external_id = first.external_id + @long_name = first.instance_variable_get(:@long_name) + @uri = first.instance_variable_get(:@uri) elsif first.kind_of? Array super( parent ) @name = first[0] @@ -111,13 +147,14 @@ module REXML output << START output << ' ' output << @name - output << " #@external_id" if @external_id - output << " #{@long_name.inspect}" if @long_name - output << " #{@uri.inspect}" if @uri + if @external_id + reference_writer = ReferenceWriter.new(@external_id, + @long_name, + @uri) + reference_writer.write(output) + end unless @children.empty? - next_indent = indent + 1 output << ' [' - child = nil # speed @children.each { |child| output << "\n" f.write( child, output ) @@ -140,8 +177,8 @@ module REXML @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES @entities[ child.name ] = child if child.kind_of? Entity end - - # This method retrieves the public identifier identifying the document's + + # This method retrieves the public identifier identifying the document's # DTD. # # Method contributed by Henrik Martensson @@ -153,7 +190,7 @@ module REXML strip_quotes(@long_name) end end - + # This method retrieves the system identifier identifying the document's DTD # # Method contributed by Henrik Martensson @@ -165,16 +202,16 @@ module REXML @uri.kind_of?(String) ? strip_quotes(@uri) : nil end end - + # This method returns a list of notations that have been declared in the - # _internal_ DTD subset. Notations in the external DTD subset are not + # _internal_ DTD subset. Notations in the external DTD subset are not # listed. # # Method contributed by Henrik Martensson def notations children().select {|node| node.kind_of?(REXML::NotationDecl)} end - + # Retrieves a named notation. Only notations declared in the internal # DTD subset can be retrieved. # @@ -184,12 +221,12 @@ module REXML notation_decl.name == name } end - + private - + # Method contributed by Henrik Martensson def strip_quotes(quoted_string) - quoted_string =~ /^[\'\"].*[\´\"]$/ ? + quoted_string =~ /^[\'\"].*[\'\"]$/ ? quoted_string[1, quoted_string.length-2] : quoted_string end @@ -218,7 +255,7 @@ module REXML output << to_s end end - + public class ElementDecl < Declaration def initialize( src ) @@ -250,17 +287,17 @@ module REXML end def to_s - "<!NOTATION #@name #@middle#{ - @public ? ' ' + public.inspect : '' - }#{ - @system ? ' ' +@system.inspect : '' - }>" + notation = "<!NOTATION #{@name}" + reference_writer = ReferenceWriter.new(@middle, @public, @system) + reference_writer.write(notation) + notation << ">" + notation end def write( output, indent=-1 ) output << to_s end - + # This method retrieves the name of the notation. # # Method contributed by Henrik Martensson diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb index 54aa691ad8..806bc499cd 100644 --- a/lib/rexml/document.rb +++ b/lib/rexml/document.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: false +require "rexml/security" require "rexml/element" require "rexml/xmldecl" require "rexml/source" @@ -16,130 +18,138 @@ module REXML # Document has a single child that can be accessed by root(). # Note that if you want to have an XML declaration written for a document # you create, you must add one; REXML documents do not write a default - # declaration for you. See |DECLARATION| and |write|. - class Document < Element - # A convenient default XML declaration. If you want an XML declaration, - # the easiest way to add one is mydoc << Document::DECLARATION + # declaration for you. See |DECLARATION| and |write|. + class Document < Element + # A convenient default XML declaration. If you want an XML declaration, + # the easiest way to add one is mydoc << Document::DECLARATION # +DEPRECATED+ # Use: mydoc << XMLDecl.default - DECLARATION = XMLDecl.default - - # Constructor - # @param source if supplied, must be a Document, String, or IO. - # Documents have their context and Element attributes cloned. - # Strings are expected to be valid XML documents. IOs are expected - # to be sources of valid XML documents. - # @param context if supplied, contains the context of the document; - # this should be a Hash. - def initialize( source = nil, context = {} ) - super() - @context = context - return if source.nil? - if source.kind_of? Document - @context = source.context - super source - else - build( source ) - end - end + DECLARATION = XMLDecl.default + + # Constructor + # @param source if supplied, must be a Document, String, or IO. + # Documents have their context and Element attributes cloned. + # Strings are expected to be valid XML documents. IOs are expected + # to be sources of valid XML documents. + # @param context if supplied, contains the context of the document; + # this should be a Hash. + def initialize( source = nil, context = {} ) + @entity_expansion_count = 0 + super() + @context = context + return if source.nil? + if source.kind_of? Document + @context = source.context + super source + else + build( source ) + end + end def node_type :document end - # Should be obvious - def clone - Document.new self - end + # Should be obvious + def clone + Document.new self + end - # According to the XML spec, a root node has no expanded name - def expanded_name - '' - #d = doc_type - #d ? d.name : "UNDEFINED" - end + # According to the XML spec, a root node has no expanded name + def expanded_name + '' + #d = doc_type + #d ? d.name : "UNDEFINED" + end - alias :name :expanded_name + alias :name :expanded_name - # We override this, because XMLDecls and DocTypes must go at the start - # of the document - def add( child ) - if child.kind_of? XMLDecl - @children.unshift child + # We override this, because XMLDecls and DocTypes must go at the start + # of the document + def add( child ) + if child.kind_of? XMLDecl + if @children[0].kind_of? XMLDecl + @children[0] = child + else + @children.unshift child + end child.parent = self - elsif child.kind_of? DocType - # Find first Element or DocType node and insert the decl right + elsif child.kind_of? DocType + # Find first Element or DocType node and insert the decl right # before it. If there is no such node, just insert the child at the # end. If there is a child and it is an DocType, then replace it. - insert_before_index = 0 - @children.find { |x| - insert_before_index += 1 + insert_before_index = @children.find_index { |x| x.kind_of?(Element) || x.kind_of?(DocType) } - if @children[ insert_before_index ] # Not null = not end of list - if @children[ insert_before_index ].kind_of DocType + if insert_before_index # Not null = not end of list + if @children[ insert_before_index ].kind_of? DocType @children[ insert_before_index ] = child else - @children[ index_before_index-1, 0 ] = child + @children[ insert_before_index-1, 0 ] = child end else # Insert at end of list - @children[insert_before_index] = child + @children << child end - child.parent = self - else - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end - end - alias :<< :add - - def add_element(arg=nil, arg2=nil) - rv = super - raise "attempted adding second root element to document" if @elements.size > 1 - rv - end - - # @return the root Element of the document, or nil if this document - # has no children. - def root + child.parent = self + else + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + end + alias :<< :add + + def add_element(arg=nil, arg2=nil) + rv = super + raise "attempted adding second root element to document" if @elements.size > 1 + rv + end + + # @return the root Element of the document, or nil if this document + # has no children. + def root elements[1] #self #@children.find { |item| item.kind_of? Element } - end - - # @return the DocType child of the document, if one exists, - # and nil otherwise. - def doctype - @children.find { |item| item.kind_of? DocType } - end - - # @return the XMLDecl of this document; if no XMLDecl has been - # set, the default declaration is returned. - def xml_decl - rv = @children[0] + end + + # @return the DocType child of the document, if one exists, + # and nil otherwise. + def doctype + @children.find { |item| item.kind_of? DocType } + end + + # @return the XMLDecl of this document; if no XMLDecl has been + # set, the default declaration is returned. + def xml_decl + rv = @children[0] return rv if rv.kind_of? XMLDecl - rv = @children.unshift(XMLDecl.default)[0] - end - - # @return the XMLDecl version of this document as a String. - # If no XMLDecl has been set, returns the default version. - def version - xml_decl().version - end - - # @return the XMLDecl encoding of this document as a String. - # If no XMLDecl has been set, returns the default encoding. - def encoding - xml_decl().encoding - end - - # @return the XMLDecl standalone value of this document as a String. - # If no XMLDecl has been set, returns the default setting. - def stand_alone? - xml_decl().stand_alone? - end + @children.unshift(XMLDecl.default)[0] + end + + # @return the XMLDecl version of this document as a String. + # If no XMLDecl has been set, returns the default version. + def version + xml_decl().version + end + + # @return the XMLDecl encoding of this document as an + # Encoding object. + # If no XMLDecl has been set, returns the default encoding. + def encoding + xml_decl().encoding + end + + # @return the XMLDecl standalone value of this document as a String. + # If no XMLDecl has been set, returns the default setting. + def stand_alone? + xml_decl().stand_alone? + end + # :call-seq: + # doc.write(output=$stdout, indent=-1, transtive=false, ie_hack=false, encoding=nil) + # doc.write(options={:output => $stdout, :indent => -1, :transtive => false, :ie_hack => false, :encoding => nil}) + # # Write the XML tree out, optionally with indent. This writes out the # entire XML document, including XML declarations, doctype declarations, # and processing instructions (if any are given). @@ -150,41 +160,73 @@ module REXML # specified, because it adds unnecessary bandwidth to applications such # as XML-RPC. # + # Accept Nth argument style and options Hash style as argument. + # The recommended style is options Hash style for one or more + # arguments case. + # + # _Examples_ + # Document.new("<a><b/></a>").write + # + # output = "" + # Document.new("<a><b/></a>").write(output) + # + # output = "" + # Document.new("<a><b/></a>").write(:output => output, :indent => 2) + # # See also the classes in the rexml/formatters package for the proper way - # to change the default formatting of XML output + # to change the default formatting of XML output. # # _Examples_ - # Document.new("<a><b/></a>").serialize # - # output_string = "" - # tr = Transitive.new( output_string ) - # Document.new("<a><b/></a>").serialize( tr ) + # output = "" + # tr = Transitive.new + # tr.write(Document.new("<a><b/></a>"), output) # # output:: - # output an object which supports '<< string'; this is where the + # output an object which supports '<< string'; this is where the # document will be written. # indent:: # An integer. If -1, no indenting will be used; otherwise, the # indentation will be twice this number of spaces, and children will be - # indented an additional amount. For a value of 3, every item will be + # indented an additional amount. For a value of 3, every item will be # indented 3 more levels, or 6 more spaces (2 * 3). Defaults to -1 - # trans:: + # transitive:: # If transitive is true and indent is >= 0, then the output will be # pretty-printed in such a way that the added whitespace does not affect # the absolute *value* of the document -- that is, it leaves the value # and number of Text nodes in the document unchanged. # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. Defaults to false - def write( output=$stdout, indent=-1, trans=false, ie_hack=false ) - if xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) - output = Output.new( output, xml_decl.encoding ) + # This hack inserts a space before the /> on empty tags to address + # a limitation of Internet Explorer. Defaults to false + # encoding:: + # Encoding name as String. Change output encoding to specified encoding + # instead of encoding in XML declaration. + # Defaults to nil. It means encoding in XML declaration is used. + def write(*arguments) + if arguments.size == 1 and arguments[0].class == Hash + options = arguments[0] + + output = options[:output] + indent = options[:indent] + transitive = options[:transitive] + ie_hack = options[:ie_hack] + encoding = options[:encoding] + else + output, indent, transitive, ie_hack, encoding, = *arguments + end + + output ||= $stdout + indent ||= -1 + transitive = false if transitive.nil? + ie_hack = false if ie_hack.nil? + encoding ||= xml_decl.encoding + + if encoding != 'UTF-8' && !output.kind_of?(Output) + output = Output.new( output, encoding ) end formatter = if indent > -1 - if trans + if transitive + require "rexml/formatters/transitive" REXML::Formatters::Transitive.new( indent, ie_hack ) else REXML::Formatters::Pretty.new( indent, ie_hack ) @@ -193,16 +235,57 @@ module REXML REXML::Formatters::Default.new( ie_hack ) end formatter.write( self, output ) - end + end + + + def Document::parse_stream( source, listener ) + Parsers::StreamParser.new( source, listener ).parse + end + + # Set the entity expansion limit. By default the limit is set to 10000. + # + # Deprecated. Use REXML::Security.entity_expansion_limit= instead. + def Document::entity_expansion_limit=( val ) + Security.entity_expansion_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10000. + # + # Deprecated. Use REXML::Security.entity_expansion_limit= instead. + def Document::entity_expansion_limit + return Security.entity_expansion_limit + end + + # Set the entity expansion limit. By default the limit is set to 10240. + # + # Deprecated. Use REXML::Security.entity_expansion_text_limit= instead. + def Document::entity_expansion_text_limit=( val ) + Security.entity_expansion_text_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10240. + # + # Deprecated. Use REXML::Security.entity_expansion_text_limit instead. + def Document::entity_expansion_text_limit + return Security.entity_expansion_text_limit + end + + attr_reader :entity_expansion_count - - def Document::parse_stream( source, listener ) - Parsers::StreamParser.new( source, listener ).parse - end + def record_entity_expansion + @entity_expansion_count += 1 + if @entity_expansion_count > Security.entity_expansion_limit + raise "number of entity expansions exceeded, processing aborted." + end + end - private - def build( source ) + def document + self + end + + private + def build( source ) Parsers::TreeParser.new( source, self ).parse - end - end + end + end end diff --git a/lib/rexml/dtd/attlistdecl.rb b/lib/rexml/dtd/attlistdecl.rb index e176bb0749..32847daadb 100644 --- a/lib/rexml/dtd/attlistdecl.rb +++ b/lib/rexml/dtd/attlistdecl.rb @@ -1,10 +1,11 @@ +# frozen_string_literal: false require "rexml/child" module REXML - module DTD - class AttlistDecl < Child - START = "<!ATTLIST" - START_RE = /^\s*#{START}/um - PATTERN_RE = /\s*(#{START}.*?>)/um - end - end + module DTD + class AttlistDecl < Child + START = "<!ATTLIST" + START_RE = /^\s*#{START}/um + PATTERN_RE = /\s*(#{START}.*?>)/um + end + end end diff --git a/lib/rexml/dtd/dtd.rb b/lib/rexml/dtd/dtd.rb index 4f735d4812..927d5d847b 100644 --- a/lib/rexml/dtd/dtd.rb +++ b/lib/rexml/dtd/dtd.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require "rexml/dtd/elementdecl" require "rexml/dtd/entitydecl" require "rexml/comment" @@ -6,46 +7,41 @@ require "rexml/dtd/attlistdecl" require "rexml/parent" module REXML - module DTD - class Parser - def Parser.parse( input ) - case input - when String - parse_helper input - when File - parse_helper input.read - end - end + module DTD + class Parser + def Parser.parse( input ) + case input + when String + parse_helper input + when File + parse_helper input.read + end + end - # Takes a String and parses it out - def Parser.parse_helper( input ) - contents = Parent.new - while input.size > 0 - case input - when ElementDecl.PATTERN_RE - match = $& - source = $' - contents << ElementDecl.new( match ) - when AttlistDecl.PATTERN_RE - matchdata = $~ - source = $' - contents << AttlistDecl.new( matchdata ) - when EntityDecl.PATTERN_RE - matchdata = $~ - source = $' - contents << EntityDecl.new( matchdata ) - when Comment.PATTERN_RE - matchdata = $~ - source = $' - contents << Comment.new( matchdata ) - when NotationDecl.PATTERN_RE - matchdata = $~ - source = $' - contents << NotationDecl.new( matchdata ) - end - end - contents - end - end - end + # Takes a String and parses it out + def Parser.parse_helper( input ) + contents = Parent.new + while input.size > 0 + case input + when ElementDecl.PATTERN_RE + match = $& + contents << ElementDecl.new( match ) + when AttlistDecl.PATTERN_RE + matchdata = $~ + contents << AttlistDecl.new( matchdata ) + when EntityDecl.PATTERN_RE + matchdata = $~ + contents << EntityDecl.new( matchdata ) + when Comment.PATTERN_RE + matchdata = $~ + contents << Comment.new( matchdata ) + when NotationDecl.PATTERN_RE + matchdata = $~ + contents << NotationDecl.new( matchdata ) + end + end + contents + end + end + end end diff --git a/lib/rexml/dtd/elementdecl.rb b/lib/rexml/dtd/elementdecl.rb index c4e620f389..119fd41a8f 100644 --- a/lib/rexml/dtd/elementdecl.rb +++ b/lib/rexml/dtd/elementdecl.rb @@ -1,17 +1,18 @@ +# frozen_string_literal: false require "rexml/child" module REXML - module DTD - class ElementDecl < Child - START = "<!ELEMENT" - START_RE = /^\s*#{START}/um - PATTERN_RE = /^\s*(#{START}.*?)>/um - PATTERN_RE = /^\s*#{START}\s+((?:[:\w_][-\.\w_]*:)?[-!\*\.\w_]*)(.*?)>/ - #\s*((((["']).*?\5)|[^\/'">]*)*?)(\/)?>/um, true) + module DTD + class ElementDecl < Child + START = "<!ELEMENT" + START_RE = /^\s*#{START}/um + # PATTERN_RE = /^\s*(#{START}.*?)>/um + PATTERN_RE = /^\s*#{START}\s+((?:[:\w][-\.\w]*:)?[-!\*\.\w]*)(.*?)>/ + #\s*((((["']).*?\5)|[^\/'">]*)*?)(\/)?>/um, true) - def initialize match - @name = match[1] - @rest = match[2] - end - end - end + def initialize match + @name = match[1] + @rest = match[2] + end + end + end end diff --git a/lib/rexml/dtd/entitydecl.rb b/lib/rexml/dtd/entitydecl.rb index a5f1520f2b..45707e2f42 100644 --- a/lib/rexml/dtd/entitydecl.rb +++ b/lib/rexml/dtd/entitydecl.rb @@ -1,56 +1,57 @@ +# frozen_string_literal: false require "rexml/child" module REXML - module DTD - class EntityDecl < Child - START = "<!ENTITY" - START_RE = /^\s*#{START}/um - PUBLIC = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+PUBLIC\s+((["']).*?\3)\s+((["']).*?\5)\s*>/um - SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um - PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um - PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um - # <!ENTITY name SYSTEM "..."> - # <!ENTITY name "..."> - def initialize src - super() - md = nil - if src.match( PUBLIC ) - md = src.match( PUBLIC, true ) - @middle = "PUBLIC" - @content = "#{md[2]} #{md[4]}" - elsif src.match( SYSTEM ) - md = src.match( SYSTEM, true ) - @middle = "SYSTEM" - @content = md[2] - elsif src.match( PLAIN ) - md = src.match( PLAIN, true ) - @middle = "" - @content = md[2] - elsif src.match( PERCENT ) - md = src.match( PERCENT, true ) - @middle = "" - @content = md[2] - end - raise ParseException.new("failed Entity match", src) if md.nil? - @name = md[1] - end + module DTD + class EntityDecl < Child + START = "<!ENTITY" + START_RE = /^\s*#{START}/um + PUBLIC = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+PUBLIC\s+((["']).*?\3)\s+((["']).*?\5)\s*>/um + SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um + PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um + PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um + # <!ENTITY name SYSTEM "..."> + # <!ENTITY name "..."> + def initialize src + super() + md = nil + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + @middle = "PUBLIC" + @content = "#{md[2]} #{md[4]}" + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + @middle = "SYSTEM" + @content = md[2] + elsif src.match( PLAIN ) + md = src.match( PLAIN, true ) + @middle = "" + @content = md[2] + elsif src.match( PERCENT ) + md = src.match( PERCENT, true ) + @middle = "" + @content = md[2] + end + raise ParseException.new("failed Entity match", src) if md.nil? + @name = md[1] + end - def to_s - rv = "<!ENTITY #@name " - rv << "#@middle " if @middle.size > 0 - rv << @content - rv - end + def to_s + rv = "<!ENTITY #@name " + rv << "#@middle " if @middle.size > 0 + rv << @content + rv + end - def write( output, indent ) + def write( output, indent ) indent( output, indent ) - output << to_s - end + output << to_s + end - def EntityDecl.parse_source source, listener - md = source.match( PATTERN_RE, true ) - thing = md[0].squeeze(" \t\n\r") - listener.send inspect.downcase, thing - end - end - end + def EntityDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze(" \t\n\r") + listener.send inspect.downcase, thing + end + end + end end diff --git a/lib/rexml/dtd/notationdecl.rb b/lib/rexml/dtd/notationdecl.rb index a47ff8f24b..cfdf0b9b74 100644 --- a/lib/rexml/dtd/notationdecl.rb +++ b/lib/rexml/dtd/notationdecl.rb @@ -1,39 +1,40 @@ +# frozen_string_literal: false require "rexml/child" module REXML - module DTD - class NotationDecl < Child - START = "<!NOTATION" - START_RE = /^\s*#{START}/um - PUBLIC = /^\s*#{START}\s+(\w[\w-]*)\s+(PUBLIC)\s+((["']).*?\4)\s*>/um - SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um - def initialize src - super() - if src.match( PUBLIC ) - md = src.match( PUBLIC, true ) - elsif src.match( SYSTEM ) - md = src.match( SYSTEM, true ) - else - raise ParseException.new( "error parsing notation: no matching pattern", src ) - end - @name = md[1] - @middle = md[2] - @rest = md[3] - end + module DTD + class NotationDecl < Child + START = "<!NOTATION" + START_RE = /^\s*#{START}/um + PUBLIC = /^\s*#{START}\s+(\w[\w-]*)\s+(PUBLIC)\s+((["']).*?\4)\s*>/um + SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um + def initialize src + super() + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + else + raise ParseException.new( "error parsing notation: no matching pattern", src ) + end + @name = md[1] + @middle = md[2] + @rest = md[3] + end - def to_s - "<!NOTATION #@name #@middle #@rest>" - end + def to_s + "<!NOTATION #@name #@middle #@rest>" + end - def write( output, indent ) + def write( output, indent ) indent( output, indent ) - output << to_s - end + output << to_s + end - def NotationDecl.parse_source source, listener - md = source.match( PATTERN_RE, true ) - thing = md[0].squeeze(" \t\n\r") - listener.send inspect.downcase, thing - end - end - end + def NotationDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze(" \t\n\r") + listener.send inspect.downcase, thing + end + end + end end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb index 3db87c6126..ac9b10872c 100644 --- a/lib/rexml/element.rb +++ b/lib/rexml/element.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require "rexml/parent" require "rexml/namespace" require "rexml/attribute" @@ -20,7 +21,7 @@ module REXML class Element < Parent include Namespace - UNDEFINED = "UNDEFINED"; # The default name + UNDEFINED = "UNDEFINED"; # The default name # Mechanisms for accessing attributes and child elements of this # element. @@ -30,18 +31,18 @@ module REXML attr_accessor :context # Constructor - # arg:: - # if not supplied, will be set to the default value. - # If a String, the name of this object will be set to the argument. - # If an Element, the object will be shallowly cloned; name, - # attributes, and namespaces will be copied. Children will +not+ be - # copied. - # parent:: - # if supplied, must be a Parent, and will be used as - # the parent of this object. + # arg:: + # if not supplied, will be set to the default value. + # If a String, the name of this object will be set to the argument. + # If an Element, the object will be shallowly cloned; name, + # attributes, and namespaces will be copied. Children will +not+ be + # copied. + # parent:: + # if supplied, must be a Parent, and will be used as + # the parent of this object. # context:: - # If supplied, must be a hash containing context items. Context items - # include: + # If supplied, must be a hash containing context items. Context items + # include: # * <tt>:respect_whitespace</tt> the value of this is :+all+ or an array of # strings being the names of the elements to respect # whitespace for. Defaults to :+all+. @@ -97,7 +98,7 @@ module REXML self.class.new self end - # Evaluates to the root node of the document that this element + # Evaluates to the root node of the document that this element # belongs to. If this element doesn't belong to a document, but does # belong to another Element, the parent's root will be returned, until the # earliest ancestor is found. @@ -137,8 +138,8 @@ module REXML # is the case if: # 1. Neither :+respect_whitespace+ nor :+compress_whitespace+ has any value # 2. The context has :+respect_whitespace+ set to :+all+ or - # an array containing the name of this element, and - # :+compress_whitespace+ isn't set to :+all+ or an array containing the + # an array containing the name of this element, and + # :+compress_whitespace+ isn't set to :+all+ or an array containing the # name of this element. # The evaluation is tested against +expanded_name+, and so is namespace # sensitive. @@ -162,7 +163,7 @@ module REXML @ignore_whitespace_nodes = false if @context if @context[:ignore_whitespace_nodes] - @ignore_whitespace_nodes = + @ignore_whitespace_nodes = (@context[:ignore_whitespace_nodes] == :all or @context[:ignore_whitespace_nodes].include? expanded_name) end @@ -206,13 +207,13 @@ module REXML return namespaces end - # Evalutas to the URI for a prefix, or the empty string if no such + # Evaluates to the URI for a prefix, or the empty string if no such # namespace is declared for this element. Evaluates recursively for # ancestors. Returns the default namespace, if there is one. - # prefix:: + # prefix:: # the prefix to search for. If not supplied, returns the default # namespace if one exists - # Returns:: + # Returns:: # the namespace URI as a String, or nil if no such namespace # exists. If the namespace is undefined, returns an empty string # doc = Document.new("<a xmlns='1' xmlns:y='2'><b/><c xmlns:z='3'/></a>") @@ -235,10 +236,10 @@ module REXML end # Adds a namespace to this element. - # prefix:: + # prefix:: # the prefix string, or the namespace URI if +uri+ is not # supplied - # uri:: + # uri:: # the namespace URI. May be nil, in which +prefix+ is used as # the URI # Evaluates to: this Element @@ -280,12 +281,12 @@ module REXML # Adds a child to this element, optionally setting attributes in # the element. - # element:: + # element:: # optional. If Element, the element is added. # Otherwise, a new Element is constructed with the argument (see # Element.initialize). - # attrs:: - # If supplied, must be a Hash containing String name,value + # attrs:: + # If supplied, must be a Hash containing String name,value # pairs, which will be used to set the attributes of the new Element. # Returns:: the Element that was added # el = doc.add_element 'my-tag' @@ -296,15 +297,15 @@ module REXML raise "First argument must be either an element name, or an Element object" if element.nil? el = @elements.add(element) attrs.each do |key, value| - el.attributes[key]=Attribute.new(key,value,self) - end if attrs.kind_of? Hash + el.attributes[key]=value + end if attrs.kind_of? Hash el end # Deletes a child element. - # element:: - # Must be an +Element+, +String+, or +Integer+. If Element, - # the element is removed. If String, the element is found (via XPath) + # element:: + # Must be an +Element+, +String+, or +Integer+. If Element, + # the element is removed. If String, the element is found (via XPath) # and removed. <em>This means that any parent can remove any # descendant.<em> If Integer, the Element indexed by that number will be # removed. @@ -327,14 +328,14 @@ module REXML # Iterates through the child elements, yielding for each Element that # has a particular attribute set. - # key:: + # key:: # the name of the attribute to search for - # value:: + # value:: # the value of the attribute - # max:: - # (optional) causes this method to return after yielding + # max:: + # (optional) causes this method to return after yielding # for this number of matching children - # name:: + # name:: # (optional) if supplied, this is an XPath that filters # the children to check. # @@ -348,7 +349,7 @@ module REXML # # Yields d # doc.root.each_element_with_attribute( 'id', '1', 0, 'd' ) {|e| p e} def each_element_with_attribute( key, value=nil, max=0, name=nil, &block ) # :yields: Element - each_with_something( proc {|child| + each_with_something( proc {|child| if value.nil? child.attributes[key] != nil else @@ -359,13 +360,13 @@ module REXML # Iterates through the children, yielding for each Element that # has a particular text set. - # text:: + # text:: # the text to search for. If nil, or not supplied, will iterate # over all +Element+ children that contain at least one +Text+ node. - # max:: + # max:: # (optional) causes this method to return after yielding # for this number of matching children - # name:: + # name:: # (optional) if supplied, this is an XPath that filters # the children to check. # @@ -379,7 +380,7 @@ module REXML # # Yields d # doc.each_element_with_text(nil, 0, 'd'){|e|p e} def each_element_with_text( text=nil, max=0, name=nil, &block ) # :yields: Element - each_with_something( proc {|child| + each_with_something( proc {|child| if text.nil? child.has_text? else @@ -408,7 +409,7 @@ module REXML # doc.root.elements['c'].next_element #-> nil def next_element element = next_sibling - element = element.next_sibling until element.nil? or element.kind_of? Element + element = element.next_sibling until element.nil? or element.kind_of? Element return element end @@ -477,7 +478,7 @@ module REXML # this method with a nil argument. In this case, the next Text # child becomes the first Text child. In no case is the order of # any siblings disturbed. - # text:: + # text:: # If a String, a new Text child is created and added to # this Element as the first Text child. If Text, the text is set # as the first Child element. If nil, then any existing first Text @@ -492,7 +493,7 @@ module REXML def text=( text ) if text.kind_of? String text = Text.new( text, whitespace(), nil, raw() ) - elsif text and !text.kind_of? Text + elsif !text.nil? and !text.kind_of? Text text = Text.new( text.to_s, whitespace(), nil, raw() ) end old_text = get_text @@ -520,7 +521,7 @@ module REXML # Note that at the end of this example, the branch has <b>3</b> nodes; the 'e' # element and <b>2</b> Text node children. def add_text( text ) - if text.kind_of? String + if text.kind_of? String if @children[-1].kind_of? Text @children[-1] << text return @@ -550,11 +551,51 @@ module REXML # Attributes # ################################################# + # Fetches an attribute value or a child. + # + # If String or Symbol is specified, it's treated as attribute + # name. Attribute value as String or +nil+ is returned. This case + # is shortcut of +attributes[name]+. + # + # If Integer is specified, it's treated as the index of + # child. It returns Nth child. + # + # doc = REXML::Document.new("<a attr='1'><b/><c/></a>") + # doc.root["attr"] # => "1" + # doc.root.attributes["attr"] # => "1" + # doc.root[1] # => <c/> + def [](name_or_index) + case name_or_index + when String + attributes[name_or_index] + when Symbol + attributes[name_or_index.to_s] + else + super + end + end + def attribute( name, namespace=nil ) prefix = nil - prefix = namespaces.index(namespace) if namespace + if namespaces.respond_to? :key + prefix = namespaces.key(namespace) if namespace + else + prefix = namespaces.index(namespace) if namespace + end prefix = nil if prefix == 'xmlns' - attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + + ret_val = + attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" ) + + return ret_val unless ret_val.nil? + return nil if prefix.nil? + + # now check that prefix'es namespace is not the same as the + # default namespace + return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] ) + + attributes.get_attribute( name ) + end # Evaluates to +true+ if this element has any attributes set, false @@ -570,7 +611,7 @@ module REXML # the attribute is added to the list of Element attributes. If String, # the argument is used as the name of the new attribute, and the value # parameter must be supplied. - # value:: + # value:: # Required if +key+ is a String, and ignored if the first argument is # an Attribute. This is a String, and is used as the value # of the new Attribute. This should be the unnormalized value of the @@ -605,7 +646,7 @@ module REXML # either an Attribute or a String. In either case, the # attribute is found by matching the attribute name to the argument, # and then removed. If no attribute is found, no action is taken. - # Returns:: + # Returns:: # the attribute removed, or nil if this Element did not contain # a matching attribute # e = Element.new('E') @@ -622,7 +663,7 @@ module REXML # Other Utilities # ################################################# - # Get an array of all CData children. + # Get an array of all CData children. # IMMUTABLE def cdatas find_all { |child| child.kind_of? CData }.freeze @@ -651,7 +692,7 @@ module REXML # # Writes out this element, and recursively, all children. # output:: - # output an object which supports '<< string'; this is where the + # output an object which supports '<< string'; this is where the # document will be written. # indent:: # An integer. If -1, no indenting will be used; otherwise, the @@ -662,19 +703,17 @@ module REXML # pretty-printed in such a way that the added whitespace does not affect # the parse tree of the document # ie_hack:: - # Internet Explorer is the worst piece of crap to have ever been - # written, with the possible exception of Windows itself. Since IE is - # unable to parse proper XML, we have to provide a hack to generate XML - # that IE's limited abilities can handle. This hack inserts a space - # before the /> on empty tags. Defaults to false + # This hack inserts a space before the /> on empty tags to address + # a limitation of Internet Explorer. Defaults to false # # out = '' # doc.write( out ) #-> doc is written to the string 'out' # doc.write( $stdout ) #-> doc written to the console - def write(writer=$stdout, indent=-1, transitive=false, ie_hack=false) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters") + def write(output=$stdout, indent=-1, transitive=false, ie_hack=false) + Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 if transitive + require "rexml/formatters/transitive" REXML::Formatters::Transitive.new( indent, ie_hack ) else REXML::Formatters::Pretty.new( indent, ie_hack ) @@ -690,8 +729,8 @@ module REXML def __to_xpath_helper node rv = node.expanded_name.clone if node.parent - results = node.parent.find_all {|n| - n.kind_of?(REXML::Element) and n.expanded_name == node.expanded_name + results = node.parent.find_all {|n| + n.kind_of?(REXML::Element) and n.expanded_name == node.expanded_name } if results.length > 1 idx = results.index( node ) @@ -704,7 +743,6 @@ module REXML # A private helper method def each_with_something( test, max=0, name=nil ) num = 0 - child=nil @elements.each( name ){ |child| yield child if test.call(child) and num += 1 return if max>0 and num == max @@ -718,7 +756,7 @@ module REXML # A class which provides filtering of children for Elements, and # XPath search support. You are expected to only encounter this class as - # the <tt>element.elements</tt> object. Therefore, you are + # the <tt>element.elements</tt> object. Therefore, you are # _not_ expected to instantiate this yourself. class Elements include Enumerable @@ -730,7 +768,7 @@ module REXML # Fetches a child element. Filters only Element children, regardless of # the XPath match. - # index:: + # index:: # the search parameter. This is either an Integer, which # will be used to find the index'th child Element, or an XPath, # which will be used to search for the Element. <em>Because @@ -740,7 +778,7 @@ module REXML # child element is at index 1, not 0, and the +n+th element is at index # +n+, not <tt>n-1</tt>. This is because XPath indexes element children # starting from 1, not 0, and the indexes should be the same. - # name:: + # name:: # optional, and only used in the first argument is an # Integer. In that case, the index'th child Element that has the # supplied name will be returned. Note again that the indexes start at 1. @@ -754,16 +792,15 @@ module REXML raise "index (#{index}) must be >= 1" if index < 1 name = literalize(name) if name num = 0 - child = nil @element.find { |child| child.kind_of? Element and - (name.nil? ? true : child.has_name?( name )) and + (name.nil? ? true : child.has_name?( name )) and (num += 1) == index } else return XPath::first( @element, index ) - #{ |element| - # return element if element.kind_of? Element + #{ |element| + # return element if element.kind_of? Element #} #return nil end @@ -772,7 +809,7 @@ module REXML # Sets an element, replacing any previous matching element. If no # existing element is found ,the element is added. # index:: Used to find a matching element to replace. See [](). - # element:: + # element:: # The element to replace the existing element with # the previous element # Returns:: nil if no previous element was found. @@ -797,12 +834,12 @@ module REXML @element.find{ |child| child.kind_of? Element}.nil? end - # Returns the index of the supplied child (starting at 1), or -1 if + # Returns the index of the supplied child (starting at 1), or -1 if # the element is not a child # element:: an +Element+ child def index element rv = 0 - found = @element.find do |child| + found = @element.find do |child| child.kind_of? Element and (rv += 1) and child == element @@ -812,7 +849,7 @@ module REXML end # Deletes a child Element - # element:: + # element:: # Either an Element, which is removed directly; an # xpath, where the first matching child is removed; or an Integer, # where the n'th Element is removed. @@ -839,7 +876,7 @@ module REXML # deleted = doc.elements.delete_all 'a/c' #-> [<c/>, <c/>, <c/>, <c/>] def delete_all( xpath ) rv = [] - XPath::each( @element, xpath) {|element| + XPath::each( @element, xpath) {|element| rv << element if element.kind_of? Element } rv.each do |element| @@ -850,7 +887,7 @@ module REXML end # Adds an element - # element:: + # element:: # if supplied, is either an Element, String, or # Source (see Element.initialize). If not supplied or nil, a # new, default Element will be constructed @@ -859,7 +896,6 @@ module REXML # a.elements.add(Element.new('b')) #-> <a><b/></a> # a.elements.add('c') #-> <a><b/><c/></a> def add element=nil - rv = nil if element.nil? Element.new("", self, @element.context) elsif not element.kind_of?(Element) @@ -875,31 +911,31 @@ module REXML # Iterates through all of the child Elements, optionally filtering # them by a given XPath - # xpath:: - # optional. If supplied, this is a String XPath, and is used to + # xpath:: + # optional. If supplied, this is a String XPath, and is used to # filter the children, so that only matching children are yielded. Note # that XPaths are automatically filtered for Elements, so that # non-Element children will not be yielded # doc = Document.new '<a><b/><c/><d/>sean<b/><c/><d/></a>' - # doc.root.each {|e|p e} #-> Yields b, c, d, b, c, d elements - # doc.root.each('b') {|e|p e} #-> Yields b, b elements - # doc.root.each('child::node()') {|e|p e} + # doc.root.elements.each {|e|p e} #-> Yields b, c, d, b, c, d elements + # doc.root.elements.each('b') {|e|p e} #-> Yields b, b elements + # doc.root.elements.each('child::node()') {|e|p e} # #-> Yields <b/>, <c/>, <d/>, <b/>, <c/>, <d/> # XPath.each(doc.root, 'child::node()', &block) # #-> Yields <b/>, <c/>, <d/>, sean, <b/>, <c/>, <d/> - def each( xpath=nil, &block) + def each( xpath=nil ) XPath::each( @element, xpath ) {|e| yield e if e.kind_of? Element } end - def collect( xpath=nil, &block ) + def collect( xpath=nil ) collection = [] - XPath::each( @element, xpath ) {|e| - collection << yield(e) if e.kind_of?(Element) + XPath::each( @element, xpath ) {|e| + collection << yield(e) if e.kind_of?(Element) } collection end - def inject( xpath=nil, initial=nil, &block ) + def inject( xpath=nil, initial=nil ) first = true XPath::each( @element, xpath ) {|e| if (e.kind_of? Element) @@ -929,7 +965,7 @@ module REXML # supplied XPath matches non-Element children. # doc = Document.new '<a>sean<b/>elliott<c/></a>' # doc.root.elements.to_a #-> [ <b/>, <c/> ] - # doc.root.elements.to_a("child::node()") #-> [ <b/>, <c/> ] + # doc.root.elements.to_a("child::node()") #-> [ <b/>, <c/> ] # XPath.match(doc.root, "child::node()") #-> [ sean, <b/>, elliott, <c/> ] def to_a( xpath=nil ) rv = XPath.match( @element, xpath ) @@ -949,7 +985,7 @@ module REXML # ATTRIBUTES # ######################################################################## - # A class that defines the set of Attributes of an Element and provides + # A class that defines the set of Attributes of an Element and provides # operations for accessing elements in that set. class Attributes < Hash # Constructor @@ -961,11 +997,11 @@ module REXML # Fetches an attribute value. If you want to get the Attribute itself, # use get_attribute() # name:: an XPath attribute name. Namespaces are relevant here. - # Returns:: + # Returns:: # the String value of the matching attribute, or +nil+ if no # matching attribute was found. This is the unnormalized value # (with entities expanded). - # + # # doc = Document.new "<a foo:att='1' bar:att='2' att='<'/>" # doc.root.attributes['att'] #-> '<' # doc.root.attributes['bar:att'] #-> '2' @@ -976,7 +1012,7 @@ module REXML end def to_a - values.flatten + enum_for(:each_attribute).to_a end # Returns the number of attributes the owning Element contains. @@ -991,7 +1027,7 @@ module REXML # Iterates over the attributes of an Element. Yields actual Attribute # nodes, not String values. - # + # # doc = Document.new '<a x="1" y="2"/>' # doc.root.attributes.each_attribute {|attr| # p attr.expanded_name+" => "+attr.value @@ -1013,12 +1049,12 @@ module REXML # doc.root.attributes.each {|name, value| p name+" => "+value } def each each_attribute do |attr| - yield attr.expanded_name, attr.value + yield [attr.expanded_name, attr.value] end end # Fetches an attribute - # name:: + # name:: # the name by which to search for the attribute. Can be a # <tt>prefix:name</tt> namespace name. # Returns:: The first matching attribute, or nil if there was none. This @@ -1062,22 +1098,22 @@ module REXML # Sets an attribute, overwriting any existing attribute value by the # same name. Namespace is significant. # name:: the name of the attribute - # value:: + # value:: # (optional) If supplied, the value of the attribute. If # nil, any existing matching attribute is deleted. - # Returns:: + # Returns:: # Owning element # doc = Document.new "<a x:foo='1' foo='3'/>" # doc.root.attributes['y:foo'] = '2' # doc.root.attributes['foo'] = '4' # doc.root.attributes['x:foo'] = nil def []=( name, value ) - if value.nil? # Delete the named attribute + if value.nil? # Delete the named attribute attr = get_attribute(name) delete attr return end - element_document = @element.document + unless value.kind_of? Attribute if @element.document and @element.document.doctype value = Text::normalize( value, @element.document.doctype ) @@ -1094,23 +1130,23 @@ module REXML old_attr[value.prefix] = value elsif old_attr.prefix != value.prefix # Check for conflicting namespaces - raise ParseException.new( + raise ParseException.new( "Namespace conflict in adding attribute \"#{value.name}\": "+ "Prefix \"#{old_attr.prefix}\" = "+ "\"#{@element.namespace(old_attr.prefix)}\" and prefix "+ - "\"#{value.prefix}\" = \"#{@element.namespace(value.prefix)}\"") if + "\"#{value.prefix}\" = \"#{@element.namespace(value.prefix)}\"") if value.prefix != "xmlns" and old_attr.prefix != "xmlns" and - @element.namespace( old_attr.prefix ) == + @element.namespace( old_attr.prefix ) == @element.namespace( value.prefix ) - store value.name, { old_attr.prefix => old_attr, - value.prefix => value } + store value.name, { old_attr.prefix => old_attr, + value.prefix => value } else store value.name, value end return @element end - # Returns an array of Strings containing all of the prefixes declared + # Returns an array of Strings containing all of the prefixes declared # by this set of # attributes. The array does not include the default # namespace declaration, if one exists. # doc = Document.new("<a xmlns='foo' xmlns:x='bar' xmlns:y='twee' "+ @@ -1149,7 +1185,7 @@ module REXML end # Removes an attribute - # attribute:: + # attribute:: # either a String, which is the name of the attribute to remove -- # namespaces are significant here -- or the attribute to remove. # Returns:: the owning element @@ -1170,9 +1206,8 @@ module REXML prefix = '' unless prefix end old = fetch(name, nil) - attr = nil if old.kind_of? Hash # the supplied attribute is one of many - attr = old.delete(prefix) + old.delete(prefix) if old.size == 1 repl = nil old.each_value{|v| repl = v} @@ -1181,8 +1216,7 @@ module REXML elsif old.nil? return @element else # the supplied attribute is a top-level one - attr = old - res = super(name) + super(name) end @element end @@ -1197,12 +1231,12 @@ module REXML alias :<< :add # Deletes all attributes matching a name. Namespaces are significant. - # name:: + # name:: # A String; all attributes that match this path will be removed # Returns:: an Array of the Attributes that were removed def delete_all( name ) rv = [] - each_attribute { |attribute| + each_attribute { |attribute| rv << attribute if attribute.expanded_name == name } rv.each{ |attr| attr.remove } @@ -1212,16 +1246,20 @@ module REXML # The +get_attribute_ns+ method retrieves a method by its namespace # and name. Thus it is possible to reliably identify an attribute # even if an XML processor has changed the prefix. - # + # # Method contributed by Henrik Martensson def get_attribute_ns(namespace, name) + result = nil each_attribute() { |attribute| if name == attribute.name && - namespace == attribute.namespace() - return attribute + namespace == attribute.namespace() && + ( !namespace.empty? || !attribute.fully_expanded_name.index(':') ) + # foo will match xmlns:foo, but only if foo isn't also an attribute + result = attribute if !result or !namespace.empty? or + !attribute.fully_expanded_name.index(':') end } - nil + result end end end diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb index a01763be99..da2d70d6c9 100644 --- a/lib/rexml/encoding.rb +++ b/lib/rexml/encoding.rb @@ -1,71 +1,51 @@ -# -*- mode: ruby; ruby-indent-level: 2; indent-tabs-mode: t; tab-width: 2 -*- vim: sw=2 ts=2 +# coding: US-ASCII +# frozen_string_literal: false module REXML module Encoding - @encoding_methods = {} - def self.register(enc, &block) - @encoding_methods[enc] = block - end - def self.apply(obj, enc) - @encoding_methods[enc][obj] - end - def self.encoding_method(enc) - @encoding_methods[enc] - end - - # Native, default format is UTF-8, so it is declared here rather than in - # an encodings/ definition. - UTF_8 = 'UTF-8' - UTF_16 = 'UTF-16' - UNILE = 'UNILE' - # ID ---> Encoding name attr_reader :encoding - def encoding=( enc ) - old_verbosity = $VERBOSE - begin - $VERBOSE = false - enc = enc.nil? ? nil : enc.upcase - return false if defined? @encoding and enc == @encoding - if enc and enc != UTF_8 - @encoding = enc - raise ArgumentError, "Bad encoding name #@encoding" unless @encoding =~ /^[\w-]+$/ - @encoding.untaint - begin - require 'rexml/encodings/ICONV.rb' - Encoding.apply(self, "ICONV") - rescue LoadError, Exception - begin - enc_file = File.join( "rexml", "encodings", "#@encoding.rb" ) - require enc_file - Encoding.apply(self, @encoding) - rescue LoadError => err - puts err.message - raise ArgumentError, "No decoder found for encoding #@encoding. Please install iconv." - end - end - else - @encoding = UTF_8 - require 'rexml/encodings/UTF-8.rb' - Encoding.apply(self, @encoding) + def encoding=(encoding) + encoding = encoding.name if encoding.is_a?(Encoding) + if encoding.is_a?(String) + original_encoding = encoding + encoding = find_encoding(encoding) + unless encoding + raise ArgumentError, "Bad encoding name #{original_encoding}" end - ensure - $VERBOSE = old_verbosity + end + return false if defined?(@encoding) and encoding == @encoding + if encoding + @encoding = encoding.upcase + else + @encoding = 'UTF-8' end true end - def check_encoding str - # We have to recognize UTF-16, LSB UTF-16, and UTF-8 - if str[0] == 0xfe && str[1] == 0xff - str[0,2] = "" - return UTF_16 - elsif str[0] == 0xff && str[1] == 0xfe - str[0,2] = "" - return UNILE + def encode(string) + string.encode(@encoding) + end + + def decode(string) + string.encode(::Encoding::UTF_8, @encoding) + end + + private + def find_encoding(name) + case name + when /\Ashift-jis\z/i + return "SHIFT_JIS" + when /\ACP-(\d+)\z/ + name = "CP#{$1}" + when /\AUTF-8\z/i + return name + end + begin + ::Encoding::Converter.search_convpath(name, 'UTF-8') + rescue ::Encoding::ConverterNotFoundError + return nil end - str =~ /^\s*<\?xml\s+version\s*=\s*(['"]).*?\1\s+encoding\s*=\s*(["'])(.*?)\2/um - return $3.upcase if $3 - return UTF_8 + name end end end diff --git a/lib/rexml/encodings/CP-1252.rb b/lib/rexml/encodings/CP-1252.rb deleted file mode 100644 index 8675f9ff98..0000000000 --- a/lib/rexml/encodings/CP-1252.rb +++ /dev/null @@ -1,103 +0,0 @@ -# -# This class was contributed by Mikko Tiihonen mikko DOT tiihonen AT hut DOT fi -# -module REXML - module Encoding - register( "CP-1252" ) do |o| - class << o - alias encode encode_cp1252 - alias decode decode_cp1252 - end - end - - # Convert from UTF-8 - def encode_cp1252(content) - array_utf8 = content.unpack('U*') - array_enc = [] - array_utf8.each do |num| - case num - # shortcut first bunch basic characters - when 0..0xFF; array_enc << num - # characters added compared to iso-8859-1 - when 0x20AC; array_enc << 0x80 # 0xe2 0x82 0xac - when 0x201A; array_enc << 0x82 # 0xe2 0x82 0x9a - when 0x0192; array_enc << 0x83 # 0xc6 0x92 - when 0x201E; array_enc << 0x84 # 0xe2 0x82 0x9e - when 0x2026; array_enc << 0x85 # 0xe2 0x80 0xa6 - when 0x2020; array_enc << 0x86 # 0xe2 0x80 0xa0 - when 0x2021; array_enc << 0x87 # 0xe2 0x80 0xa1 - when 0x02C6; array_enc << 0x88 # 0xcb 0x86 - when 0x2030; array_enc << 0x89 # 0xe2 0x80 0xb0 - when 0x0160; array_enc << 0x8A # 0xc5 0xa0 - when 0x2039; array_enc << 0x8B # 0xe2 0x80 0xb9 - when 0x0152; array_enc << 0x8C # 0xc5 0x92 - when 0x017D; array_enc << 0x8E # 0xc5 0xbd - when 0x2018; array_enc << 0x91 # 0xe2 0x80 0x98 - when 0x2019; array_enc << 0x92 # 0xe2 0x80 0x99 - when 0x201C; array_enc << 0x93 # 0xe2 0x80 0x9c - when 0x201D; array_enc << 0x94 # 0xe2 0x80 0x9d - when 0x2022; array_enc << 0x95 # 0xe2 0x80 0xa2 - when 0x2013; array_enc << 0x96 # 0xe2 0x80 0x93 - when 0x2014; array_enc << 0x97 # 0xe2 0x80 0x94 - when 0x02DC; array_enc << 0x98 # 0xcb 0x9c - when 0x2122; array_enc << 0x99 # 0xe2 0x84 0xa2 - when 0x0161; array_enc << 0x9A # 0xc5 0xa1 - when 0x203A; array_enc << 0x9B # 0xe2 0x80 0xba - when 0x0152; array_enc << 0x9C # 0xc5 0x93 - when 0x017E; array_enc << 0x9E # 0xc5 0xbe - when 0x0178; array_enc << 0x9F # 0xc5 0xb8 - else - # all remaining basic characters can be used directly - if num <= 0xFF - array_enc << num - else - # Numeric entity (&#nnnn;); shard by Stefan Scholl - array_enc.concat "&\##{num};".unpack('C*') - end - end - end - array_enc.pack('C*') - end - - # Convert to UTF-8 - def decode_cp1252(str) - array_latin9 = str.unpack('C*') - array_enc = [] - array_latin9.each do |num| - case num - # characters that added compared to iso-8859-1 - when 0x80; array_enc << 0x20AC # 0xe2 0x82 0xac - when 0x82; array_enc << 0x201A # 0xe2 0x82 0x9a - when 0x83; array_enc << 0x0192 # 0xc6 0x92 - when 0x84; array_enc << 0x201E # 0xe2 0x82 0x9e - when 0x85; array_enc << 0x2026 # 0xe2 0x80 0xa6 - when 0x86; array_enc << 0x2020 # 0xe2 0x80 0xa0 - when 0x87; array_enc << 0x2021 # 0xe2 0x80 0xa1 - when 0x88; array_enc << 0x02C6 # 0xcb 0x86 - when 0x89; array_enc << 0x2030 # 0xe2 0x80 0xb0 - when 0x8A; array_enc << 0x0160 # 0xc5 0xa0 - when 0x8B; array_enc << 0x2039 # 0xe2 0x80 0xb9 - when 0x8C; array_enc << 0x0152 # 0xc5 0x92 - when 0x8E; array_enc << 0x017D # 0xc5 0xbd - when 0x91; array_enc << 0x2018 # 0xe2 0x80 0x98 - when 0x92; array_enc << 0x2019 # 0xe2 0x80 0x99 - when 0x93; array_enc << 0x201C # 0xe2 0x80 0x9c - when 0x94; array_enc << 0x201D # 0xe2 0x80 0x9d - when 0x95; array_enc << 0x2022 # 0xe2 0x80 0xa2 - when 0x96; array_enc << 0x2013 # 0xe2 0x80 0x93 - when 0x97; array_enc << 0x2014 # 0xe2 0x80 0x94 - when 0x98; array_enc << 0x02DC # 0xcb 0x9c - when 0x99; array_enc << 0x2122 # 0xe2 0x84 0xa2 - when 0x9A; array_enc << 0x0161 # 0xc5 0xa1 - when 0x9B; array_enc << 0x203A # 0xe2 0x80 0xba - when 0x9C; array_enc << 0x0152 # 0xc5 0x93 - when 0x9E; array_enc << 0x017E # 0xc5 0xbe - when 0x9F; array_enc << 0x0178 # 0xc5 0xb8 - else - array_enc << num - end - end - array_enc.pack('U*') - end - end -end diff --git a/lib/rexml/encodings/EUC-JP.rb b/lib/rexml/encodings/EUC-JP.rb deleted file mode 100644 index db37b6bf0d..0000000000 --- a/lib/rexml/encodings/EUC-JP.rb +++ /dev/null @@ -1,35 +0,0 @@ -module REXML - module Encoding - begin - require 'uconv' - - def decode_eucjp(str) - Uconv::euctou8(str) - end - - def encode_eucjp content - Uconv::u8toeuc(content) - end - rescue LoadError - require 'nkf' - - EUCTOU8 = '-Ewm0' - U8TOEUC = '-Wem0' - - def decode_eucjp(str) - NKF.nkf(EUCTOU8, str) - end - - def encode_eucjp content - NKF.nkf(U8TOEUC, content) - end - end - - register("EUC-JP") do |obj| - class << obj - alias decode decode_eucjp - alias encode encode_eucjp - end - end - end -end diff --git a/lib/rexml/encodings/ICONV.rb b/lib/rexml/encodings/ICONV.rb deleted file mode 100644 index 172fba7cd1..0000000000 --- a/lib/rexml/encodings/ICONV.rb +++ /dev/null @@ -1,22 +0,0 @@ -require "iconv" -raise LoadError unless defined? Iconv - -module REXML - module Encoding - def decode_iconv(str) - Iconv.conv(UTF_8, @encoding, str) - end - - def encode_iconv(content) - Iconv.conv(@encoding, UTF_8, content) - end - - register("ICONV") do |obj| - Iconv.conv(UTF_8, obj.encoding, nil) - class << obj - alias decode decode_iconv - alias encode encode_iconv - end - end - end -end diff --git a/lib/rexml/encodings/ISO-8859-1.rb b/lib/rexml/encodings/ISO-8859-1.rb deleted file mode 100644 index 2873d13bf0..0000000000 --- a/lib/rexml/encodings/ISO-8859-1.rb +++ /dev/null @@ -1,7 +0,0 @@ -require 'rexml/encodings/US-ASCII' - -module REXML - module Encoding - register("ISO-8859-1", &encoding_method("US-ASCII")) - end -end diff --git a/lib/rexml/encodings/ISO-8859-15.rb b/lib/rexml/encodings/ISO-8859-15.rb deleted file mode 100644 index 8dea0d38a4..0000000000 --- a/lib/rexml/encodings/ISO-8859-15.rb +++ /dev/null @@ -1,72 +0,0 @@ -# -# This class was contributed by Mikko Tiihonen mikko DOT tiihonen AT hut DOT fi -# -module REXML - module Encoding - register("ISO-8859-15") do |o| - alias encode to_iso_8859_15 - alias decode from_iso_8859_15 - end - - # Convert from UTF-8 - def to_iso_8859_15(content) - array_utf8 = content.unpack('U*') - array_enc = [] - array_utf8.each do |num| - case num - # shortcut first bunch basic characters - when 0..0xA3; array_enc << num - # characters removed compared to iso-8859-1 - when 0xA4; array_enc << '¤' - when 0xA6; array_enc << '¦' - when 0xA8; array_enc << '¨' - when 0xB4; array_enc << '´' - when 0xB8; array_enc << '¸' - when 0xBC; array_enc << '¼' - when 0xBD; array_enc << '½' - when 0xBE; array_enc << '¾' - # characters added compared to iso-8859-1 - when 0x20AC; array_enc << 0xA4 # 0xe2 0x82 0xac - when 0x0160; array_enc << 0xA6 # 0xc5 0xa0 - when 0x0161; array_enc << 0xA8 # 0xc5 0xa1 - when 0x017D; array_enc << 0xB4 # 0xc5 0xbd - when 0x017E; array_enc << 0xB8 # 0xc5 0xbe - when 0x0152; array_enc << 0xBC # 0xc5 0x92 - when 0x0153; array_enc << 0xBD # 0xc5 0x93 - when 0x0178; array_enc << 0xBE # 0xc5 0xb8 - else - # all remaining basic characters can be used directly - if num <= 0xFF - array_enc << num - else - # Numeric entity (&#nnnn;); shard by Stefan Scholl - array_enc.concat "&\##{num};".unpack('C*') - end - end - end - array_enc.pack('C*') - end - - # Convert to UTF-8 - def from_iso_8859_15(str) - array_latin9 = str.unpack('C*') - array_enc = [] - array_latin9.each do |num| - case num - # characters that differ compared to iso-8859-1 - when 0xA4; array_enc << 0x20AC - when 0xA6; array_enc << 0x0160 - when 0xA8; array_enc << 0x0161 - when 0xB4; array_enc << 0x017D - when 0xB8; array_enc << 0x017E - when 0xBC; array_enc << 0x0152 - when 0xBD; array_enc << 0x0153 - when 0xBE; array_enc << 0x0178 - else - array_enc << num - end - end - array_enc.pack('U*') - end - end -end diff --git a/lib/rexml/encodings/SHIFT-JIS.rb b/lib/rexml/encodings/SHIFT-JIS.rb deleted file mode 100644 index 9e0f4af20e..0000000000 --- a/lib/rexml/encodings/SHIFT-JIS.rb +++ /dev/null @@ -1,37 +0,0 @@ -module REXML - module Encoding - begin - require 'uconv' - - def decode_sjis content - Uconv::sjistou8(content) - end - - def encode_sjis(str) - Uconv::u8tosjis(str) - end - rescue LoadError - require 'nkf' - - SJISTOU8 = '-Swm0x' - U8TOSJIS = '-Wsm0x' - - def decode_sjis(str) - NKF.nkf(SJISTOU8, str) - end - - def encode_sjis content - NKF.nkf(U8TOSJIS, content) - end - end - - b = proc do |obj| - class << obj - alias decode decode_sjis - alias encode encode_sjis - end - end - register("SHIFT-JIS", &b) - register("SHIFT_JIS", &b) - end -end diff --git a/lib/rexml/encodings/SHIFT_JIS.rb b/lib/rexml/encodings/SHIFT_JIS.rb deleted file mode 100644 index e355704a7c..0000000000 --- a/lib/rexml/encodings/SHIFT_JIS.rb +++ /dev/null @@ -1 +0,0 @@ -require 'rexml/encodings/SHIFT-JIS' diff --git a/lib/rexml/encodings/UNILE.rb b/lib/rexml/encodings/UNILE.rb deleted file mode 100644 index d054140c40..0000000000 --- a/lib/rexml/encodings/UNILE.rb +++ /dev/null @@ -1,34 +0,0 @@ -module REXML - module Encoding - def encode_unile content - array_utf8 = content.unpack("U*") - array_enc = [] - array_utf8.each do |num| - if ((num>>16) > 0) - array_enc << ?? - array_enc << 0 - else - array_enc << (num & 0xFF) - array_enc << (num >> 8) - end - end - array_enc.pack('C*') - end - - def decode_unile(str) - array_enc=str.unpack('C*') - array_utf8 = [] - 0.step(array_enc.size-1, 2){|i| - array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100) - } - array_utf8.pack('U*') - end - - register(UNILE) do |obj| - class << obj - alias decode decode_unile - alias encode encode_unile - end - end - end -end diff --git a/lib/rexml/encodings/US-ASCII.rb b/lib/rexml/encodings/US-ASCII.rb deleted file mode 100644 index fb4c217074..0000000000 --- a/lib/rexml/encodings/US-ASCII.rb +++ /dev/null @@ -1,30 +0,0 @@ -module REXML - module Encoding - # Convert from UTF-8 - def encode_ascii content - array_utf8 = content.unpack('U*') - array_enc = [] - array_utf8.each do |num| - if num <= 0x7F - array_enc << num - else - # Numeric entity (&#nnnn;); shard by Stefan Scholl - array_enc.concat "&\##{num};".unpack('C*') - end - end - array_enc.pack('C*') - end - - # Convert to UTF-8 - def decode_ascii(str) - str.unpack('C*').pack('U*') - end - - register("US-ASCII") do |obj| - class << obj - alias decode decode_ascii - alias encode encode_ascii - end - end - end -end diff --git a/lib/rexml/encodings/UTF-16.rb b/lib/rexml/encodings/UTF-16.rb deleted file mode 100644 index 007c493d9c..0000000000 --- a/lib/rexml/encodings/UTF-16.rb +++ /dev/null @@ -1,35 +0,0 @@ -module REXML - module Encoding - def encode_utf16 content - array_utf8 = content.unpack("U*") - array_enc = [] - array_utf8.each do |num| - if ((num>>16) > 0) - array_enc << 0 - array_enc << ?? - else - array_enc << (num >> 8) - array_enc << (num & 0xFF) - end - end - array_enc.pack('C*') - end - - def decode_utf16(str) - str = str[2..-1] if /^\376\377/n =~ str - array_enc=str.unpack('C*') - array_utf8 = [] - 0.step(array_enc.size-1, 2){|i| - array_utf8 << (array_enc.at(i+1) + array_enc.at(i)*0x100) - } - array_utf8.pack('U*') - end - - register(UTF_16) do |obj| - class << obj - alias decode decode_utf16 - alias encode encode_utf16 - end - end - end -end diff --git a/lib/rexml/encodings/UTF-8.rb b/lib/rexml/encodings/UTF-8.rb deleted file mode 100644 index bb08f44100..0000000000 --- a/lib/rexml/encodings/UTF-8.rb +++ /dev/null @@ -1,18 +0,0 @@ -module REXML - module Encoding - def encode_utf8 content - content - end - - def decode_utf8(str) - str - end - - register(UTF_8) do |obj| - class << obj - alias decode decode_utf8 - alias encode encode_utf8 - end - end - end -end diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index ff2d45f39b..97c7b6b42f 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -1,165 +1,171 @@ +# frozen_string_literal: false require 'rexml/child' require 'rexml/source' require 'rexml/xmltokens' module REXML - # God, I hate DTDs. I really do. Why this idiot standard still - # plagues us is beyond me. - class Entity < Child - include XMLTokens - PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" - SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} - PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} - EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" - NDATADECL = "\\s+NDATA\\s+#{NAME}" - PEREFERENCE = "%#{NAME};" - ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} - PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" - ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" - PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" - GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" - ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + class Entity < Child + include XMLTokens + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>" + GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um - attr_reader :name, :external, :ref, :ndata, :pubid + attr_reader :name, :external, :ref, :ndata, :pubid - # Create a new entity. Simple entities can be constructed by passing a - # name, value to the constructor; this creates a generic, plain entity - # reference. For anything more complicated, you have to pass a Source to - # the constructor with the entity definiton, or use the accessor methods. - # +WARNING+: There is no validation of entity state except when the entity - # is read from a stream. If you start poking around with the accessors, - # you can easily create a non-conformant Entity. The best thing to do is - # dump the stupid DTDs and use XMLSchema instead. - # - # e = Entity.new( 'amp', '&' ) - def initialize stream, value=nil, parent=nil, reference=false - super(parent) - @ndata = @pubid = @value = @external = nil - if stream.kind_of? Array - @name = stream[1] - if stream[-1] == '%' - @reference = true - stream.pop - else - @reference = false - end - if stream[2] =~ /SYSTEM|PUBLIC/ - @external = stream[2] - if @external == 'SYSTEM' - @ref = stream[3] - @ndata = stream[4] if stream.size == 5 - else - @pubid = stream[3] - @ref = stream[4] - end - else - @value = stream[2] - end - else - @reference = reference - @external = nil - @name = stream - @value = value - end - end + # Create a new entity. Simple entities can be constructed by passing a + # name, value to the constructor; this creates a generic, plain entity + # reference. For anything more complicated, you have to pass a Source to + # the constructor with the entity definition, or use the accessor methods. + # +WARNING+: There is no validation of entity state except when the entity + # is read from a stream. If you start poking around with the accessors, + # you can easily create a non-conformant Entity. + # + # e = Entity.new( 'amp', '&' ) + def initialize stream, value=nil, parent=nil, reference=false + super(parent) + @ndata = @pubid = @value = @external = nil + if stream.kind_of? Array + @name = stream[1] + if stream[-1] == '%' + @reference = true + stream.pop + else + @reference = false + end + if stream[2] =~ /SYSTEM|PUBLIC/ + @external = stream[2] + if @external == 'SYSTEM' + @ref = stream[3] + @ndata = stream[4] if stream.size == 5 + else + @pubid = stream[3] + @ref = stream[4] + end + else + @value = stream[2] + end + else + @reference = reference + @external = nil + @name = stream + @value = value + end + end - # Evaluates whether the given string matchs an entity definition, - # returning true if so, and false otherwise. - def Entity::matches? string - (ENTITYDECL =~ string) == 0 - end + # Evaluates whether the given string matches an entity definition, + # returning true if so, and false otherwise. + def Entity::matches? string + (ENTITYDECL =~ string) == 0 + end - # Evaluates to the unnormalized value of this entity; that is, replacing - # all entities -- both %ent; and &ent; entities. This differs from - # +value()+ in that +value+ only replaces %ent; entities. - def unnormalized - v = value() - return nil if v.nil? - @unnormalized = Text::unnormalize(v, parent) - @unnormalized - end + # Evaluates to the unnormalized value of this entity; that is, replacing + # all entities -- both %ent; and &ent; entities. This differs from + # +value()+ in that +value+ only replaces %ent; entities. + def unnormalized + document.record_entity_expansion unless document.nil? + v = value() + return nil if v.nil? + @unnormalized = Text::unnormalize(v, parent) + @unnormalized + end - #once :unnormalized + #once :unnormalized - # Returns the value of this entity unprocessed -- raw. This is the - # normalized value; that is, with all %ent; and &ent; entities intact - def normalized - @value - end + # Returns the value of this entity unprocessed -- raw. This is the + # normalized value; that is, with all %ent; and &ent; entities intact + def normalized + @value + end - # Write out a fully formed, correct entity definition (assuming the Entity - # object itself is valid.) + # Write out a fully formed, correct entity definition (assuming the Entity + # object itself is valid.) # # out:: # An object implementing <TT><<<TT> to which the entity will be # output # indent:: # *DEPRECATED* and ignored - def write out, indent=-1 - out << '<!ENTITY ' - out << '% ' if @reference - out << @name - out << ' ' - if @external - out << @external << ' ' - if @pubid - q = @pubid.include?('"')?"'":'"' - out << q << @pubid << q << ' ' - end - q = @ref.include?('"')?"'":'"' - out << q << @ref << q - out << ' NDATA ' << @ndata if @ndata - else - q = @value.include?('"')?"'":'"' - out << q << @value << q - end - out << '>' - end + def write out, indent=-1 + out << '<!ENTITY ' + out << '% ' if @reference + out << @name + out << ' ' + if @external + out << @external << ' ' + if @pubid + q = @pubid.include?('"')?"'":'"' + out << q << @pubid << q << ' ' + end + q = @ref.include?('"')?"'":'"' + out << q << @ref << q + out << ' NDATA ' << @ndata if @ndata + else + q = @value.include?('"')?"'":'"' + out << q << @value << q + end + out << '>' + end - # Returns this entity as a string. See write(). - def to_s - rv = '' - write rv - rv - end + # Returns this entity as a string. See write(). + def to_s + rv = '' + write rv + rv + end - PEREFERENCE_RE = /#{PEREFERENCE}/um - # Returns the value of this entity. At the moment, only internal entities - # are processed. If the value contains internal references (IE, - # %blah;), those are replaced with their values. IE, if the doctype - # contains: - # <!ENTITY % foo "bar"> - # <!ENTITY yada "nanoo %foo; nanoo> - # then: - # doctype.entity('yada').value #-> "nanoo bar nanoo" - def value - if @value - matches = @value.scan(PEREFERENCE_RE) - rv = @value.clone - if @parent - matches.each do |entity_reference| - entity_value = @parent.entity( entity_reference[0] ) - rv.gsub!( /%#{entity_reference};/um, entity_value ) - end - end - return rv - end - nil - end - end + PEREFERENCE_RE = /#{PEREFERENCE}/um + # Returns the value of this entity. At the moment, only internal entities + # are processed. If the value contains internal references (IE, + # %blah;), those are replaced with their values. IE, if the doctype + # contains: + # <!ENTITY % foo "bar"> + # <!ENTITY yada "nanoo %foo; nanoo> + # then: + # doctype.entity('yada').value #-> "nanoo bar nanoo" + def value + if @value + matches = @value.scan(PEREFERENCE_RE) + rv = @value.clone + if @parent + sum = 0 + matches.each do |entity_reference| + entity_value = @parent.entity( entity_reference[0] ) + if sum + entity_value.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + else + sum += entity_value.bytesize + end + rv.gsub!( /%#{entity_reference.join};/um, entity_value ) + end + end + return rv + end + nil + end + end - # This is a set of entity constants -- the ones defined in the XML - # specification. These are +gt+, +lt+, +amp+, +quot+ and +apos+. - module EntityConst - # +>+ - GT = Entity.new( 'gt', '>' ) - # +<+ - LT = Entity.new( 'lt', '<' ) - # +&+ - AMP = Entity.new( 'amp', '&' ) - # +"+ - QUOT = Entity.new( 'quot', '"' ) - # +'+ - APOS = Entity.new( 'apos', "'" ) - end + # This is a set of entity constants -- the ones defined in the XML + # specification. These are +gt+, +lt+, +amp+, +quot+ and +apos+. + # CAUTION: these entities does not have parent and document + module EntityConst + # +>+ + GT = Entity.new( 'gt', '>' ) + # +<+ + LT = Entity.new( 'lt', '<' ) + # +&+ + AMP = Entity.new( 'amp', '&' ) + # +"+ + QUOT = Entity.new( 'quot', '"' ) + # +'+ + APOS = Entity.new( 'apos', "'" ) + end end diff --git a/lib/rexml/formatters/default.rb b/lib/rexml/formatters/default.rb index 77381bdf84..c375f1468b 100644 --- a/lib/rexml/formatters/default.rb +++ b/lib/rexml/formatters/default.rb @@ -1,7 +1,8 @@ +# frozen_string_literal: false module REXML module Formatters class Default - # Prints out the XML document with no formatting -- except if id_hack is + # Prints out the XML document with no formatting -- except if ie_hack is # set. # # ie_hack:: @@ -21,8 +22,8 @@ module REXML def write( node, output ) case node - when Document - if node.xml_decl.encoding != "UTF-8" && !output.kind_of?(Output) + when Document + if node.xml_decl.encoding != 'UTF-8' && !output.kind_of?(Output) output = Output.new( output, node.xml_decl.encoding ) end write_document( node, output ) @@ -63,14 +64,16 @@ module REXML def write_element( node, output ) output << "<#{node.expanded_name}" - node.attributes.each_attribute do |attr| + node.attributes.to_a.map { |a| + Hash === a ? a.values : a + }.flatten.sort_by {|attr| attr.name}.each do |attr| output << " " attr.write( output ) end unless node.attributes.empty? if node.children.empty? output << " " if @ie_hack - output << "/" + output << "/" else output << ">" node.children.each { |child| diff --git a/lib/rexml/formatters/pretty.rb b/lib/rexml/formatters/pretty.rb index d21175d34a..a80274bdad 100644 --- a/lib/rexml/formatters/pretty.rb +++ b/lib/rexml/formatters/pretty.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/formatters/default' module REXML @@ -24,13 +25,14 @@ module REXML # is undefined. Defaults to 2. # ie_hack:: # If true, the printer will insert whitespace before closing empty - # tags, thereby allowing Internet Explorer's feeble XML parser to + # tags, thereby allowing Internet Explorer's XML parser to # function. Defaults to false. def initialize( indentation=2, ie_hack=false ) @indentation = indentation @level = 0 @ie_hack = ie_hack @width = 80 + @compact = false end protected @@ -47,7 +49,7 @@ module REXML if @ie_hack output << " " end - output << "/" + output << "/" else output << ">" # If compact and all children are text, and if the formatted output @@ -87,7 +89,7 @@ module REXML s = node.to_s() s.gsub!(/\s/,' ') s.squeeze!(" ") - s = wrap(s, 80-@level) + s = wrap(s, @width - @level) s = indent_text(s, @level, " ", true) output << (' '*@level + s) end @@ -125,10 +127,13 @@ module REXML end def wrap(string, width) - # Recursively wrap string at width. - return string if string.length <= width - place = string.rindex(' ', width) # Position in string with last ' ' before cutoff - return string[0,place] + "\n" + wrap(string[place+1..-1], width) + parts = [] + while string.length > width and place = string.rindex(' ', width) + parts << string[0...place] + string = string[place+1..-1] + end + parts << string + parts.join("\n") end end diff --git a/lib/rexml/formatters/transitive.rb b/lib/rexml/formatters/transitive.rb index 1d80f21fbb..81e67f3274 100644 --- a/lib/rexml/formatters/transitive.rb +++ b/lib/rexml/formatters/transitive.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/formatters/pretty' module REXML @@ -12,9 +13,10 @@ module REXML # formatted. Since this formatter does not alter whitespace nodes, the # results of formatting already formatted XML will be odd. class Transitive < Default - def initialize( indentation=2 ) + def initialize( indentation=2, ie_hack=false ) @indentation = indentation @level = 0 + @ie_hack = ie_hack end protected @@ -29,13 +31,13 @@ module REXML output << "\n" output << ' '*@level if node.children.empty? - output << "/" + output << " " if @ie_hack + output << "/" else output << ">" # If compact and all children are text, and if the formatted output # is less than the specified width, then try to print everything on # one line - skip = false @level += @indentation node.children.each { |child| write( child, output ) diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb index fefc9ef940..cd879fdd28 100644 --- a/lib/rexml/functions.rb +++ b/lib/rexml/functions.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false module REXML # If you add a method, keep in mind two things: # (1) the first argument will always be a list of nodes from which to @@ -7,10 +8,28 @@ module REXML # Therefore, in XML, "local-name()" is identical (and actually becomes) # "local_name()" module Functions + @@available_functions = {} @@context = nil @@namespace_context = {} @@variables = {} + INTERNAL_METHODS = [ + :namespace_context, + :namespace_context=, + :variables, + :variables=, + :context=, + :get_namespace, + :send, + ] + class << self + def singleton_method_added(name) + unless INTERNAL_METHODS.include?(name) + @@available_functions[name] = true + end + end + end + def Functions::namespace_context=(x) ; @@namespace_context=x ; end def Functions::variables=(x) ; @@variables=x ; end def Functions::namespace_context ; @@namespace_context ; end @@ -28,6 +47,7 @@ module REXML end end + # Returns the last node of the given list of nodes. def Functions::last( ) @@context[:size] end @@ -36,6 +56,7 @@ module REXML @@context[:index] end + # Returns the size of the given list of nodes. def Functions::count( node_set ) node_set.size end @@ -48,7 +69,7 @@ module REXML # UNTESTED def Functions::local_name( node_set=nil ) get_namespace( node_set ) do |node| - return node.local_name + return node.local_name end end @@ -57,7 +78,7 @@ module REXML end def Functions::name( node_set=nil ) - get_namespace( node_set ) do |node| + get_namespace( node_set ) do |node| node.expanded_name end end @@ -66,7 +87,7 @@ module REXML def Functions::get_namespace( node_set = nil ) if node_set == nil yield @@context[:node] if defined? @@context[:node].namespace - else + else if node_set.respond_to? :each node_set.each { |node| yield node if defined? node.namespace } elsif node_set.respond_to? :namespace @@ -81,15 +102,15 @@ module REXML # # A number is converted to a string as follows # - # NaN is converted to the string NaN + # NaN is converted to the string NaN # - # positive zero is converted to the string 0 + # positive zero is converted to the string 0 # - # negative zero is converted to the string 0 + # negative zero is converted to the string 0 # - # positive infinity is converted to the string Infinity + # positive infinity is converted to the string Infinity # - # negative infinity is converted to the string -Infinity + # negative infinity is converted to the string -Infinity # # if the number is an integer, the number is represented in decimal form # as a Number with no decimal point and no leading zeros, preceded by a @@ -129,6 +150,11 @@ module REXML end end + # A node-set is converted to a string by + # returning the concatenation of the string-value + # of each of the children of the node in the + # node-set that is first in document order. + # If the node-set is empty, an empty string is returned. def Functions::string_value( o ) rv = "" o.children.each { |e| @@ -156,7 +182,7 @@ module REXML string(string).include?(string(test)) end - # Kouhei fixed this + # Kouhei fixed this def Functions::substring_before( string, test ) ruby_string = string(string) ruby_index = ruby_string.index(string(test)) @@ -166,20 +192,19 @@ module REXML ruby_string[ 0...ruby_index ] end end - + # Kouhei fixed this too def Functions::substring_after( string, test ) ruby_string = string(string) - test_string = string(test) return $1 if ruby_string =~ /#{test}(.*)/ "" end - # Take equal portions of Mike Stok and Sean Russell; mix + # Take equal portions of Mike Stok and Sean Russell; mix # vigorously, and pour into a tall, chilled glass. Serves 10,000. def Functions::substring( string, start, length=nil ) ruby_string = string(string) - ruby_length = if length.nil? + ruby_length = if length.nil? ruby_string.length.to_f else number(length) @@ -188,18 +213,18 @@ module REXML # Handle the special cases return '' if ( - ruby_length.nan? or + ruby_length.nan? or ruby_start.nan? or ruby_start.infinite? ) infinite_length = ruby_length.infinite? == 1 ruby_length = ruby_string.length if infinite_length - - # Now, get the bounds. The XPath bounds are 1..length; the ruby bounds + + # Now, get the bounds. The XPath bounds are 1..length; the ruby bounds # are 0..length. Therefore, we have to offset the bounds by one. - ruby_start = ruby_start.round - 1 - ruby_length = ruby_length.round + ruby_start = round(ruby_start) - 1 + ruby_length = round(ruby_length) if ruby_start < 0 ruby_length += ruby_start unless infinite_length @@ -247,7 +272,7 @@ module REXML 0.upto(from.length - 1) { |pos| from_char = from[pos] unless map.has_key? from_char - map[from_char] = + map[from_char] = if pos < to.length to[pos] else @@ -256,9 +281,15 @@ module REXML end } - string(string).unpack('U*').collect { |c| - if map.has_key? c then map[c] else c end - }.compact.pack('U*') + if ''.respond_to? :chars + string(string).chars.collect { |c| + if map.has_key? c then map[c] else c end + }.compact.join + else + string(string).unpack('U*').collect { |c| + if map.has_key? c then map[c] else c end + }.compact.pack('U*') + end end # UNTESTED @@ -351,9 +382,9 @@ module REXML def Functions::sum( nodes ) nodes = [nodes] unless nodes.kind_of? Array - nodes.inject(0) { |r,n| r += number(string(n)) } + nodes.inject(0) { |r,n| r + number(string(n)) } end - + def Functions::floor( number ) number(number).floor end @@ -363,10 +394,13 @@ module REXML end def Functions::round( number ) + number = number(number) begin - number(number).round + neg = number.negative? + number = number.abs.round(half: :up) + neg ? -number : number rescue FloatDomainError - number(number) + number end end @@ -374,9 +408,14 @@ module REXML node.node_type == :processing_instruction end - def Functions::method_missing( id ) - puts "METHOD MISSING #{id.id2name}" - XPath.match( @@context[:node], id.id2name ) + def Functions::send(name, *args) + if @@available_functions[name.to_sym] + super + else + # TODO: Maybe, this is not XPath spec behavior. + # This behavior must be reconsidered. + XPath.match(@@context[:node], name.to_s) + end end end end diff --git a/lib/rexml/instruction.rb b/lib/rexml/instruction.rb index c16b894b4a..c4f65eefc1 100644 --- a/lib/rexml/instruction.rb +++ b/lib/rexml/instruction.rb @@ -1,63 +1,64 @@ +# frozen_string_literal: false require "rexml/child" require "rexml/source" module REXML - # Represents an XML Instruction; IE, <? ... ?> - # TODO: Add parent arg (3rd arg) to constructor - class Instruction < Child - START = '<\?' - STOP = '\?>' + # Represents an XML Instruction; IE, <? ... ?> + # TODO: Add parent arg (3rd arg) to constructor + class Instruction < Child + START = '<\?' + STOP = '\?>' - # target is the "name" of the Instruction; IE, the "tag" in <?tag ...?> - # content is everything else. - attr_accessor :target, :content + # target is the "name" of the Instruction; IE, the "tag" in <?tag ...?> + # content is everything else. + attr_accessor :target, :content - # Constructs a new Instruction - # @param target can be one of a number of things. If String, then - # the target of this instruction is set to this. If an Instruction, - # then the Instruction is shallowly cloned (target and content are - # copied). If a Source, then the source is scanned and parsed for - # an Instruction declaration. - # @param content Must be either a String, or a Parent. Can only - # be a Parent if the target argument is a Source. Otherwise, this - # String is set as the content of this instruction. - def initialize(target, content=nil) - if target.kind_of? String - super() - @target = target - @content = content - elsif target.kind_of? Instruction - super(content) - @target = target.target - @content = target.content - end - @content.strip! if @content - end + # Constructs a new Instruction + # @param target can be one of a number of things. If String, then + # the target of this instruction is set to this. If an Instruction, + # then the Instruction is shallowly cloned (target and content are + # copied). If a Source, then the source is scanned and parsed for + # an Instruction declaration. + # @param content Must be either a String, or a Parent. Can only + # be a Parent if the target argument is a Source. Otherwise, this + # String is set as the content of this instruction. + def initialize(target, content=nil) + if target.kind_of? String + super() + @target = target + @content = content + elsif target.kind_of? Instruction + super(content) + @target = target.target + @content = target.content + end + @content.strip! if @content + end + + def clone + Instruction.new self + end - def clone - Instruction.new self - end - # == DEPRECATED # See the rexml/formatters package # - def write writer, indent=-1, transitive=false, ie_hack=false - Kernel.warn( "#{self.class.name}.write is deprecated" ) - indent(writer, indent) - writer << START.sub(/\\/u, '') - writer << @target - writer << ' ' - writer << @content - writer << STOP.sub(/\\/u, '') - end + def write writer, indent=-1, transitive=false, ie_hack=false + Kernel.warn( "#{self.class.name}.write is deprecated", uplevel: 1) + indent(writer, indent) + writer << START.sub(/\\/u, '') + writer << @target + writer << ' ' + writer << @content + writer << STOP.sub(/\\/u, '') + end - # @return true if other is an Instruction, and the content and target - # of the other matches the target and content of this object. - def ==( other ) - other.kind_of? Instruction and - other.target == @target and - other.content == @content - end + # @return true if other is an Instruction, and the content and target + # of the other matches the target and content of this object. + def ==( other ) + other.kind_of? Instruction and + other.target == @target and + other.content == @content + end def node_type :processing_instruction @@ -66,5 +67,5 @@ module REXML def inspect "<?p-i #{target} ...?>" end - end + end end diff --git a/lib/rexml/light/node.rb b/lib/rexml/light/node.rb index 943ec5f1a0..d58119a3a4 100644 --- a/lib/rexml/light/node.rb +++ b/lib/rexml/light/node.rb @@ -1,196 +1,196 @@ +# frozen_string_literal: false require 'rexml/xmltokens' -require 'rexml/light/node' # [ :element, parent, name, attributes, children* ] - # a = Node.new - # a << "B" # => <a>B</a> - # a.b # => <a>B<b/></a> - # a.b[1] # => <a>B<b/><b/><a> - # a.b[1]["x"] = "y" # => <a>B<b/><b x="y"/></a> - # a.b[0].c # => <a>B<b><c/></b><b x="y"/></a> - # a.b.c << "D" # => <a>B<b><c>D</c></b><b x="y"/></a> + # a = Node.new + # a << "B" # => <a>B</a> + # a.b # => <a>B<b/></a> + # a.b[1] # => <a>B<b/><b/><a> + # a.b[1]["x"] = "y" # => <a>B<b/><b x="y"/></a> + # a.b[0].c # => <a>B<b><c/></b><b x="y"/></a> + # a.b.c << "D" # => <a>B<b><c>D</c></b><b x="y"/></a> module REXML - module Light - # Represents a tagged XML element. Elements are characterized by - # having children, attributes, and names, and can themselves be - # children. - class Node - NAMESPLIT = /^(?:(#{XMLTokens::NCNAME_STR}):)?(#{XMLTokens::NCNAME_STR})/u - PARENTS = [ :element, :document, :doctype ] - # Create a new element. - def initialize node=nil - @node = node - if node.kind_of? String - node = [ :text, node ] - elsif node.nil? - node = [ :document, nil, nil ] - elsif node[0] == :start_element - node[0] = :element - elsif node[0] == :start_doctype - node[0] = :doctype - elsif node[0] == :start_document - node[0] = :document - end - end - - def size - if PARENTS.include? @node[0] - @node[-1].size - else - 0 - end - end - - def each( &block ) - size.times { |x| yield( at(x+4) ) } - end - - def name - at(2) - end - - def name=( name_str, ns=nil ) - pfx = '' - pfx = "#{prefix(ns)}:" if ns - _old_put(2, "#{pfx}#{name_str}") - end - - def parent=( node ) - _old_put(1,node) - end - - def local_name - namesplit - @name - end - - def local_name=( name_str ) - _old_put( 1, "#@prefix:#{name_str}" ) - end - - def prefix( namespace=nil ) - prefix_of( self, namespace ) - end - - def namespace( prefix=prefix() ) - namespace_of( self, prefix ) - end - - def namespace=( namespace ) - @prefix = prefix( namespace ) - pfx = '' - pfx = "#@prefix:" if @prefix.size > 0 - _old_put(1, "#{pfx}#@name") - end - - def []( reference, ns=nil ) - if reference.kind_of? String - pfx = '' - pfx = "#{prefix(ns)}:" if ns - at(3)["#{pfx}#{reference}"] - elsif reference.kind_of? Range - _old_get( Range.new(4+reference.begin, reference.end, reference.exclude_end?) ) - else - _old_get( 4+reference ) - end - end - - def =~( path ) - XPath.match( self, path ) - end - - # Doesn't handle namespaces yet - def []=( reference, ns, value=nil ) - if reference.kind_of? String - value = ns unless value - at( 3 )[reference] = value - elsif reference.kind_of? Range - _old_put( Range.new(3+reference.begin, reference.end, reference.exclude_end?), ns ) - else - if value - _old_put( 4+reference, ns, value ) - else - _old_put( 4+reference, ns ) - end - end - end - - # Append a child to this element, optionally under a provided namespace. - # The namespace argument is ignored if the element argument is an Element - # object. Otherwise, the element argument is a string, the namespace (if - # provided) is the namespace the element is created in. - def << element - if node_type() == :text - at(-1) << element - else - newnode = Node.new( element ) - newnode.parent = self - self.push( newnode ) - end - at(-1) - end - - def node_type - _old_get(0) - end - - def text=( foo ) - replace = at(4).kind_of?(String)? 1 : 0 - self._old_put(4,replace, normalizefoo) - end - - def root - context = self - context = context.at(1) while context.at(1) - end - - def has_name?( name, namespace = '' ) - at(3) == name and namespace() == namespace - end - - def children - self - end - - def parent - at(1) - end - - def to_s - - end - - private - - def namesplit - return if @name.defined? - at(2) =~ NAMESPLIT - @prefix = '' || $1 - @name = $2 - end - - def namespace_of( node, prefix=nil ) - if not prefix - name = at(2) - name =~ NAMESPLIT - prefix = $1 - end - to_find = 'xmlns' - to_find = "xmlns:#{prefix}" if not prefix.nil? - ns = at(3)[ to_find ] - ns ? ns : namespace_of( @node[0], prefix ) - end - - def prefix_of( node, namespace=nil ) - if not namespace - name = node.name - name =~ NAMESPLIT - $1 - else - ns = at(3).find { |k,v| v == namespace } - ns ? ns : prefix_of( node.parent, namespace ) - end - end - end - end + module Light + # Represents a tagged XML element. Elements are characterized by + # having children, attributes, and names, and can themselves be + # children. + class Node + NAMESPLIT = /^(?:(#{XMLTokens::NCNAME_STR}):)?(#{XMLTokens::NCNAME_STR})/u + PARENTS = [ :element, :document, :doctype ] + # Create a new element. + def initialize node=nil + @node = node + if node.kind_of? String + node = [ :text, node ] + elsif node.nil? + node = [ :document, nil, nil ] + elsif node[0] == :start_element + node[0] = :element + elsif node[0] == :start_doctype + node[0] = :doctype + elsif node[0] == :start_document + node[0] = :document + end + end + + def size + if PARENTS.include? @node[0] + @node[-1].size + else + 0 + end + end + + def each + size.times { |x| yield( at(x+4) ) } + end + + def name + at(2) + end + + def name=( name_str, ns=nil ) + pfx = '' + pfx = "#{prefix(ns)}:" if ns + _old_put(2, "#{pfx}#{name_str}") + end + + def parent=( node ) + _old_put(1,node) + end + + def local_name + namesplit + @name + end + + def local_name=( name_str ) + _old_put( 1, "#@prefix:#{name_str}" ) + end + + def prefix( namespace=nil ) + prefix_of( self, namespace ) + end + + def namespace( prefix=prefix() ) + namespace_of( self, prefix ) + end + + def namespace=( namespace ) + @prefix = prefix( namespace ) + pfx = '' + pfx = "#@prefix:" if @prefix.size > 0 + _old_put(1, "#{pfx}#@name") + end + + def []( reference, ns=nil ) + if reference.kind_of? String + pfx = '' + pfx = "#{prefix(ns)}:" if ns + at(3)["#{pfx}#{reference}"] + elsif reference.kind_of? Range + _old_get( Range.new(4+reference.begin, reference.end, reference.exclude_end?) ) + else + _old_get( 4+reference ) + end + end + + def =~( path ) + XPath.match( self, path ) + end + + # Doesn't handle namespaces yet + def []=( reference, ns, value=nil ) + if reference.kind_of? String + value = ns unless value + at( 3 )[reference] = value + elsif reference.kind_of? Range + _old_put( Range.new(3+reference.begin, reference.end, reference.exclude_end?), ns ) + else + if value + _old_put( 4+reference, ns, value ) + else + _old_put( 4+reference, ns ) + end + end + end + + # Append a child to this element, optionally under a provided namespace. + # The namespace argument is ignored if the element argument is an Element + # object. Otherwise, the element argument is a string, the namespace (if + # provided) is the namespace the element is created in. + def << element + if node_type() == :text + at(-1) << element + else + newnode = Node.new( element ) + newnode.parent = self + self.push( newnode ) + end + at(-1) + end + + def node_type + _old_get(0) + end + + def text=( foo ) + replace = at(4).kind_of?(String)? 1 : 0 + self._old_put(4,replace, normalizefoo) + end + + def root + context = self + context = context.at(1) while context.at(1) + end + + def has_name?( name, namespace = '' ) + at(3) == name and namespace() == namespace + end + + def children + self + end + + def parent + at(1) + end + + def to_s + + end + + private + + def namesplit + return if @name.defined? + at(2) =~ NAMESPLIT + @prefix = '' || $1 + @name = $2 + end + + def namespace_of( node, prefix=nil ) + if not prefix + name = at(2) + name =~ NAMESPLIT + prefix = $1 + end + to_find = 'xmlns' + to_find = "xmlns:#{prefix}" if not prefix.nil? + ns = at(3)[ to_find ] + ns ? ns : namespace_of( @node[0], prefix ) + end + + def prefix_of( node, namespace=nil ) + if not namespace + name = node.name + name =~ NAMESPLIT + $1 + else + ns = at(3).find { |k,v| v == namespace } + ns ? ns : prefix_of( node.parent, namespace ) + end + end + end + end end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb index 3e8790580b..90ba7cc635 100644 --- a/lib/rexml/namespace.rb +++ b/lib/rexml/namespace.rb @@ -1,47 +1,48 @@ +# frozen_string_literal: false require 'rexml/xmltokens' module REXML - # Adds named attributes to an object. - module Namespace - # The name of the object, valid if set - attr_reader :name, :expanded_name - # The expanded name of the object, valid if name is set - attr_accessor :prefix - include XMLTokens - NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u + # Adds named attributes to an object. + module Namespace + # The name of the object, valid if set + attr_reader :name, :expanded_name + # The expanded name of the object, valid if name is set + attr_accessor :prefix + include XMLTokens + NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u - # Sets the name and the expanded name - def name=( name ) - @expanded_name = name - name =~ NAMESPLIT - if $1 - @prefix = $1 - else - @prefix = "" - @namespace = "" - end - @name = $2 - end + # Sets the name and the expanded name + def name=( name ) + @expanded_name = name + name =~ NAMESPLIT + if $1 + @prefix = $1 + else + @prefix = "" + @namespace = "" + end + @name = $2 + end - # Compares names optionally WITH namespaces - def has_name?( other, ns=nil ) - if ns - return (namespace() == ns and name() == other) - elsif other.include? ":" - return fully_expanded_name == other - else - return name == other - end - end + # Compares names optionally WITH namespaces + def has_name?( other, ns=nil ) + if ns + return (namespace() == ns and name() == other) + elsif other.include? ":" + return fully_expanded_name == other + else + return name == other + end + end - alias :local_name :name + alias :local_name :name - # Fully expand the name, even if the prefix wasn't specified in the - # source file. - def fully_expanded_name - ns = prefix - return "#{ns}:#@name" if ns.size > 0 - return @name - end - end + # Fully expand the name, even if the prefix wasn't specified in the + # source file. + def fully_expanded_name + ns = prefix + return "#{ns}:#@name" if ns.size > 0 + return @name + end + end end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb index d5e8456e53..52337ade44 100644 --- a/lib/rexml/node.rb +++ b/lib/rexml/node.rb @@ -1,31 +1,32 @@ +# frozen_string_literal: false require "rexml/parseexception" require "rexml/formatters/pretty" require "rexml/formatters/default" module REXML - # Represents a node in the tree. Nodes are never encountered except as - # superclasses of other objects. Nodes have siblings. - module Node - # @return the next sibling (nil if unset) - def next_sibling_node - return nil if @parent.nil? - @parent[ @parent.index(self) + 1 ] - end + # Represents a node in the tree. Nodes are never encountered except as + # superclasses of other objects. Nodes have siblings. + module Node + # @return the next sibling (nil if unset) + def next_sibling_node + return nil if @parent.nil? + @parent[ @parent.index(self) + 1 ] + end - # @return the previous sibling (nil if unset) - def previous_sibling_node - return nil if @parent.nil? - ind = @parent.index(self) - return nil if ind == 0 - @parent[ ind - 1 ] - end + # @return the previous sibling (nil if unset) + def previous_sibling_node + return nil if @parent.nil? + ind = @parent.index(self) + return nil if ind == 0 + @parent[ ind - 1 ] + end # indent:: # *DEPRECATED* This parameter is now ignored. See the formatters in the # REXML::Formatters package for changing the output style. - def to_s indent=nil + def to_s indent=nil unless indent.nil? - Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated" ) + Kernel.warn( "#{self.class.name}.to_s(indent) parameter is deprecated", uplevel: 1) f = REXML::Formatters::Pretty.new( indent ) f.write( self, rv = "" ) else @@ -33,33 +34,33 @@ module REXML f.write( self, rv = "" ) end return rv - end + end - def indent to, ind - if @parent and @parent.context and not @parent.context[:indentstyle].nil? then - indentstyle = @parent.context[:indentstyle] - else - indentstyle = ' ' - end - to << indentstyle*ind unless ind<1 - end + def indent to, ind + if @parent and @parent.context and not @parent.context[:indentstyle].nil? then + indentstyle = @parent.context[:indentstyle] + else + indentstyle = ' ' + end + to << indentstyle*ind unless ind<1 + end - def parent? - false; - end + def parent? + false; + end - # Visit all subnodes of +self+ recursively - def each_recursive(&block) # :yields: node - self.elements.each {|node| - block.call(node) - node.each_recursive(&block) - } - end + # Visit all subnodes of +self+ recursively + def each_recursive(&block) # :yields: node + self.elements.each {|node| + block.call(node) + node.each_recursive(&block) + } + end - # Find (and return) first subnode (recursively) for which the block + # Find (and return) first subnode (recursively) for which the block # evaluates to true. Returns +nil+ if none was found. - def find_first_recursive(&block) # :yields: node + def find_first_recursive(&block) # :yields: node each_recursive {|node| return node if block.call(node) } @@ -71,5 +72,5 @@ module REXML def index_in_parent parent.index(self)+1 end - end + end end diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb index be4d23d42d..96dfea570e 100644 --- a/lib/rexml/output.rb +++ b/lib/rexml/output.rb @@ -1,24 +1,30 @@ +# frozen_string_literal: false require 'rexml/encoding' module REXML - class Output - include Encoding - + class Output + include Encoding + attr_reader :encoding - def initialize real_IO, encd="iso-8859-1" - @output = real_IO - self.encoding = encd + def initialize real_IO, encd="iso-8859-1" + @output = real_IO + self.encoding = encd + + @to_utf = encoding != 'UTF-8' - @to_utf = encd == UTF_8 ? false : true - end + if encoding == "UTF-16" + @output << "\ufeff".encode("UTF-16BE") + self.encoding = "UTF-16BE" + end + end - def <<( content ) - @output << (@to_utf ? self.encode(content) : content) - end + def <<( content ) + @output << (@to_utf ? self.encode(content) : content) + end def to_s "Output[#{encoding}]" end - end + end end diff --git a/lib/rexml/parent.rb b/lib/rexml/parent.rb index a20aaaef6b..3bd0a96255 100644 --- a/lib/rexml/parent.rb +++ b/lib/rexml/parent.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require "rexml/child" module REXML @@ -6,62 +7,61 @@ module REXML # object. class Parent < Child include Enumerable - + # Constructor # @param parent if supplied, will be set as the parent of this object def initialize parent=nil super(parent) @children = [] end - + def add( object ) - #puts "PARENT GOTS #{size} CHILDREN" object.parent = self @children << object - #puts "PARENT NOW GOTS #{size} CHILDREN" object end - + alias :push :add alias :<< :push - + def unshift( object ) object.parent = self @children.unshift object end - + def delete( object ) found = false @children.delete_if {|c| c.equal?(object) and found = true } object.parent = nil if found + found ? object : nil end - + def each(&block) @children.each(&block) end - + def delete_if( &block ) @children.delete_if(&block) end - + def delete_at( index ) @children.delete_at index end - + def each_index( &block ) @children.each_index(&block) end - + # Fetches a child at a given index # @param index the Integer index of the child to fetch def []( index ) @children[index] end - + alias :each_child :each - - - + + + # Set an index entry. See Array.[]= # @param index the index of the element to set # @param opt either the object to set, or an Integer length @@ -71,7 +71,7 @@ module REXML args[-1].parent = self @children[*args[0..-2]] = args[-1] end - + # Inserts an child before another child # @param child1 this is either an xpath or an Element. If an Element, # child2 will be inserted before child1 in the child list of the parent. @@ -91,7 +91,7 @@ module REXML end self end - + # Inserts an child after another child # @param child1 this is either an xpath or an Element. If an Element, # child2 will be inserted after child1 in the child list of the parent. @@ -111,11 +111,11 @@ module REXML end self end - + def to_a @children.dup end - + # Fetches the index of a given child # @param child the child to get the index of # @return the index of the child, or nil if the object is not a child @@ -125,24 +125,24 @@ module REXML @children.find { |i| count += 1 ; i.hash == child.hash } count end - + # @return the number of children of this parent def size @children.size end - + alias :length :size - + # Replaces one child with another, making sure the nodelist is correct # @param to_replace the child to replace (must be a Child) - # @param replacement the child to insert into the nodelist (must be a + # @param replacement the child to insert into the nodelist (must be a # Child) def replace_child( to_replace, replacement ) @children.map! {|c| c.equal?( to_replace ) ? replacement : c } to_replace.parent = nil replacement.parent = self end - + # Deeply clones this object. This creates a complete duplicate of this # Parent, including all descendants. def deep_clone @@ -156,9 +156,9 @@ module REXML end cl end - + alias :children :to_a - + def parent? true end diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb index feb7a7e638..7b16cd1a41 100644 --- a/lib/rexml/parseexception.rb +++ b/lib/rexml/parseexception.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false module REXML class ParseException < RuntimeError attr_accessor :source, :parser, :continued_exception @@ -28,9 +29,9 @@ module REXML err << "\nLine: #{line}\n" err << "Position: #{position}\n" err << "Last 80 unconsumed characters:\n" - err << @source.buffer[0..80].gsub(/\n/, ' ') + err << @source.buffer[0..80].force_encoding("ASCII-8BIT").gsub(/\n/, ' ') end - + err end @@ -40,12 +41,12 @@ module REXML end def line - @source.current_line[2] if @source and defined? @source.current_line and + @source.current_line[2] if @source and defined? @source.current_line and @source.current_line end def context @source.current_line end - end + end end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index fc2354a67f..e7ef695912 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -1,3 +1,7 @@ +# frozen_string_literal: false + +require "strscan" + require 'rexml/parseexception' require 'rexml/undefinednamespaceexception' require 'rexml/source' @@ -25,41 +29,50 @@ module REXML # # Nat Price gave me some good ideas for the API. class BaseParser - NCNAME_STR= '[\w:][\-\w\d.]*' - NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" + LETTER = '[:alpha:]' + DIGIT = '[:digit:]' + + COMBININGCHAR = '' # TODO + EXTENDER = '' # TODO + + NCNAME_STR= "[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*" + QNAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" + QNAME = /(#{QNAME_STR})/ + + # Just for backward compatibility. For example, kramdown uses this. + # It's not used in REXML. UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" - NAMECHAR = '[\-\w\d\.:]' + NAMECHAR = '[\-\w\.:]' NAME = "([\\w:]#{NAMECHAR}*)" NMTOKEN = "(?:#{NAMECHAR})+" NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" - REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)" REFERENCE_RE = /#{REFERENCE}/ DOCTYPE_START = /\A\s*<!DOCTYPE\s/um - DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um - ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um + DOCTYPE_END = /\A\s*\]\s*>/um + ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um COMMENT_START = /\A<!--/u COMMENT_PATTERN = /<!--(.*?)-->/um CDATA_START = /\A<!\[CDATA\[/u - CDATA_END = /^\s*\]\s*>/um + CDATA_END = /\A\s*\]\s*>/um CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um XMLDECL_START = /\A<\?xml\s/u; XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um INSTRUCTION_START = /\A<\?/u INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um - TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um - CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + TAG_MATCH = /\A<((?>#{QNAME_STR}))/um + CLOSE_MATCH = /\A\s*<\/(#{QNAME_STR})\s*>/um VERSION = /\bversion\s*=\s*["'](.*?)['"]/um ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um - STANDALONE = /\bstandalone\s*=\s["'](.*?)['"]/um + STANDALONE = /\bstandalone\s*=\s*["'](.*?)['"]/um - ENTITY_START = /^\s*<!ENTITY/ - IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u - ELEMENTDECL_START = /^\s*<!ELEMENT/um - ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um - SYSTEMENTITY = /^\s*(%.*?;)\s*$/um + ENTITY_START = /\A\s*<!ENTITY/ + ELEMENTDECL_START = /\A\s*<!ELEMENT/um + ELEMENTDECL_PATTERN = /\A\s*(<!ELEMENT.*?)>/um + SYSTEMENTITY = /\A\s*(%.*?;)\s*$/um ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" @@ -68,11 +81,8 @@ module REXML DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" ATTDEF_RE = /#{ATTDEF}/ - ATTLISTDECL_START = /^\s*<!ATTLIST/um - ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um - NOTATIONDECL_START = /^\s*<!NOTATION/um - PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um - SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um + ATTLISTDECL_START = /\A\s*<!ATTLIST/um + ATTLISTDECL_PATTERN = /\A\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um TEXT_PATTERN = /\A([^<]*)/um @@ -90,13 +100,18 @@ module REXML GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>" ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + NOTATIONDECL_START = /\A\s*<!NOTATION/um + EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um + EXTERNAL_ID_SYSTEM = /\A\s*SYSTEM\s+#{SYSTEMLITERAL}\s*/um + PUBLIC_ID = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s*/um + EREFERENCE = /&(?!#{NAME};)/ - DEFAULT_ENTITIES = { - 'gt' => [/>/, '>', '>', />/], - 'lt' => [/</, '<', '<', /</], - 'quot' => [/"/, '"', '"', /"/], - "apos" => [/'/, "'", "'", /'/] + DEFAULT_ENTITIES = { + 'gt' => [/>/, '>', '>', />/], + 'lt' => [/</, '<', '<', /</], + 'quot' => [/"/, '"', '"', /"/], + "apos" => [/'/, "'", "'", /'/] } @@ -104,26 +119,14 @@ module REXML # These are patterns to identify common markup errors, to make the # error messages more informative. ###################################################################### - MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um + MISSING_ATTRIBUTE_QUOTES = /^<#{QNAME_STR}\s+#{QNAME_STR}\s*=\s*[^"']/um def initialize( source ) self.stream = source + @listeners = [] end def add_listener( listener ) - if !defined?(@listeners) or !@listeners - @listeners = [] - instance_eval <<-EOL - alias :_old_pull :pull - def pull - event = _old_pull - @listeners.each do |listener| - listener.receive event - end - event - end - EOL - end @listeners << listener end @@ -167,9 +170,9 @@ module REXML # Peek at the +depth+ event in the stack. The first element on the stack # is at depth 0. If +depth+ is -1, will parse to the end of the input # stream and return the last event, which is always :end_document. - # Be aware that this causes the stream to be parsed up to the +depth+ - # event, so you can effectively pre-parse the entire document (pull the - # entire thing into memory) using this method. + # Be aware that this causes the stream to be parsed up to the +depth+ + # event, so you can effectively pre-parse the entire document (pull the + # entire thing into memory) using this method. def peek depth=0 raise %Q[Illegal argument "#{depth}"] if depth < -1 temp = [] @@ -186,6 +189,14 @@ module REXML # Returns the next event. This is a +PullEvent+ object. def pull + pull_event.tap do |event| + @listeners.each do |listener| + listener.receive event + end + end + end + + def pull_event if @closed x, @closed = @closed, nil return [ :end_element, x ] @@ -193,11 +204,9 @@ module REXML return [ :end_document ] if empty? return @stack.shift if @stack.size > 0 #STDERR.puts @source.encoding - @source.read if @source.buffer.size<2 #STDERR.puts "BUFFER = #{@source.buffer.inspect}" if @document_status == nil - #@source.consume( /^\s*/um ) - word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um ) + word = @source.match( /\A((?:\s+)|(?:<[^>]*>))/um ) word = word[1] unless word.nil? #STDERR.puts "WORD = #{word.inspect}" case word @@ -210,44 +219,63 @@ module REXML version = version[1] unless version.nil? encoding = ENCODING.match(results) encoding = encoding[1] unless encoding.nil? - @source.encoding = encoding + if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end standalone = STANDALONE.match(results) standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] when INSTRUCTION_START return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] when DOCTYPE_START - md = @source.match( DOCTYPE_PATTERN, true ) + base_error_message = "Malformed DOCTYPE" + @source.match(DOCTYPE_START, true) @nsstack.unshift(curr_ns=Set.new) - identity = md[1] - close = md[2] - identity =~ IDENTITY - name = $1 - raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil? - pub_sys = $2.nil? ? nil : $2.strip - long_name = $4.nil? ? nil : $4.strip - uri = $6.nil? ? nil : $6.strip - args = [ :start_doctype, name, pub_sys, long_name, uri ] - if close == ">" + name = parse_name(base_error_message) + if @source.match(/\A\s*\[/um, true) + id = [nil, nil, nil] + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + id = [nil, nil, nil] @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/^\s*/um, true) - @stack << [ :end_doctype ] else - @document_status = :in_doctype + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: false) + if id[0] == "SYSTEM" + # For backward compatibility + id[1], id[2] = id[2], nil + end + if @source.match(/\A\s*\[/um, true) + @document_status = :in_doctype + elsif @source.match(/\A\s*>/um, true) + @document_status = :after_doctype + else + message = "#{base_error_message}: garbage after external ID" + raise REXML::ParseException.new(message, @source) + end + end + args = [:start_doctype, name, *id] + if @document_status == :after_doctype + @source.match(/\A\s*/um, true) + @stack << [ :end_doctype ] end return args - when /^\s+/ + when /\A\s+/ else @document_status = :after_doctype - @source.read if @source.buffer.size<2 - md = @source.match(/\s*/um, true) + if @source.encoding == "UTF-8" + @source.buffer.force_encoding(::Encoding::UTF_8) + end end end if @document_status == :in_doctype - md = @source.match(/\s*(.*?>)/um) + md = @source.match(/\A\s*(.*?>)/um) case md[1] - when SYSTEMENTITY + when SYSTEMENTITY match = @source.match( SYSTEMENTITY, true )[1] return [ :externalentity, match ] @@ -272,7 +300,8 @@ module REXML # External reference match[3] = match[3][1..-2] # PUBID match[4] = match[4][1..-2] # HREF - # match is [ :entity, name, PUBLIC, pubid, href ] + match.delete_at(5) if match.size > 5 # Chop out NDATA decl + # match is [ :entity, name, PUBLIC, pubid, href(, ndata)? ] else match[2] = match[2][1..-2] match.pop if match.size == 4 @@ -301,33 +330,50 @@ module REXML end return [ :attlistdecl, element, pairs, contents ] when NOTATIONDECL_START - md = nil - if @source.match( PUBLIC ) - md = @source.match( PUBLIC, true ) - vals = [md[1],md[2],md[4],md[6]] - elsif @source.match( SYSTEM ) - md = @source.match( SYSTEM, true ) - vals = [md[1],md[2],nil,md[4]] - else - raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) + base_error_message = "Malformed notation declaration" + unless @source.match(/\A\s*<!NOTATION\s+/um, true) + if @source.match(/\A\s*<!NOTATION\s*>/um) + message = "#{base_error_message}: name is missing" + else + message = "#{base_error_message}: invalid declaration name" + end + raise REXML::ParseException.new(message, @source) end - return [ :notationdecl, *vals ] - when CDATA_END + name = parse_name(base_error_message) + id = parse_id(base_error_message, + accept_external_id: true, + accept_public_id: true) + unless @source.match(/\A\s*>/um, true) + message = "#{base_error_message}: garbage before end >" + raise REXML::ParseException.new(message, @source) + end + return [:notationdecl, name, *id] + when DOCTYPE_END @document_status = :after_doctype - @source.match( CDATA_END, true ) + @source.match( DOCTYPE_END, true ) return [ :end_doctype ] end end + if @document_status == :after_doctype + @source.match(/\A\s*/um, true) + end begin + @source.read if @source.buffer.size<2 if @source.buffer[0] == ?< if @source.buffer[1] == ?/ @nsstack.shift last_tag = @tags.pop #md = @source.match_to_consume( '>', CLOSE_MATCH) md = @source.match( CLOSE_MATCH, true ) - raise REXML::ParseException.new( "Missing end tag for "+ - "'#{last_tag}' (got \"#{md[1]}\")", - @source) unless last_tag == md[1] + if md and !last_tag + message = "Unexpected top-level end tag (got '#{md[1]}')" + raise REXML::ParseException.new(message, @source) + end + if md.nil? or last_tag != md[1] + message = "Missing end tag for '#{last_tag}'" + message << " (got '#{md[1]}')" if md + raise REXML::ParseException.new(message, @source) + end return [ :end_element, last_tag ] elsif @source.buffer[1] == ?! md = @source.match(/\A(\s*[^>]*>)/um) @@ -335,6 +381,12 @@ module REXML raise REXML::ParseException.new("Malformed node", @source) unless md if md[0][2] == ?- md = @source.match( COMMENT_PATTERN, true ) + + case md[1] + when /--/, /-\z/ + raise REXML::ParseException.new("Malformed comment", @source) + end + return [ :comment, md[1] ] if md else md = @source.match( CDATA_PATTERN, true ) @@ -353,36 +405,13 @@ module REXML unless md # Check for missing attribute quotes raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) - raise REXML::ParseException.new("malformed XML: missing tag start", @source) + raise REXML::ParseException.new("malformed XML: missing tag start", @source) end - attributes = {} + @document_status = :in_element prefixes = Set.new prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - if md[4].size > 0 - attrs = md[4].scan( ATTRIBUTE_PATTERN ) - raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 - attrs.each { |a,b,c,d,e| - if b == "xmlns" - if c == "xml" - if d != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) - end - elsif c == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) - end - curr_ns << c - elsif b - prefixes << b unless b == "xml" - end - attributes[a] = e - } - end - + attributes, closed = parse_attributes(prefixes, curr_ns) # Verify that all of the prefixes have been defined for prefix in prefixes unless @nsstack.find{|k| k.member?(prefix)} @@ -390,7 +419,7 @@ module REXML end end - if md[6] + if closed @closed = md[1] @nsstack.shift else @@ -419,6 +448,7 @@ module REXML end return [ :dummy ] end + private :pull_event def entity( reference, entities ) value = nil @@ -436,7 +466,7 @@ module REXML # Doing it like this rather than in a loop improves the speed copy.gsub!( EREFERENCE, '&' ) entities.each do |key, value| - copy.gsub!( value, "&#{key};" ) unless entity_filter and + copy.gsub!( value, "&#{key};" ) unless entity_filter and entity_filter.include?(entity) end if entities copy.gsub!( EREFERENCE, '&' ) @@ -452,7 +482,7 @@ module REXML rv.gsub!( /\r\n?/, "\n" ) matches = rv.scan( REFERENCE_RE ) return rv if matches.size == 0 - rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) { m=$1 m = "0#{m}" if m[0] == ?x [Integer(m)].pack('U*') @@ -465,19 +495,186 @@ module REXML if entity_value re = /&#{entity_reference};/ rv.gsub!( re, entity_value ) + else + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er end end end - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - er = DEFAULT_ENTITIES[entity_reference] - rv.gsub!( er[0], er[2] ) if er - end - end rv.gsub!( /&/, '&' ) end rv end + + private + def need_source_encoding_update?(xml_declaration_encoding) + return false if xml_declaration_encoding.nil? + return false if /\AUTF-16\z/i =~ xml_declaration_encoding + true + end + + def parse_name(base_error_message) + md = @source.match(/\A\s*#{NAME}/um, true) + unless md + if @source.match(/\A\s*\S/um) + message = "#{base_error_message}: invalid name" + else + message = "#{base_error_message}: name is missing" + end + raise REXML::ParseException.new(message, @source) + end + md[1] + end + + def parse_id(base_error_message, + accept_external_id:, + accept_public_id:) + if accept_external_id and (md = @source.match(EXTERNAL_ID_PUBLIC, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + system_literal = md[2] + system = system_literal[1..-2] if system_literal # Remove quote + ["PUBLIC", pubid, system] + elsif accept_public_id and (md = @source.match(PUBLIC_ID, true)) + pubid = system = nil + pubid_literal = md[1] + pubid = pubid_literal[1..-2] if pubid_literal # Remove quote + ["PUBLIC", pubid, nil] + elsif accept_external_id and (md = @source.match(EXTERNAL_ID_SYSTEM, true)) + system = nil + system_literal = md[1] + system = system_literal[1..-2] if system_literal # Remove quote + ["SYSTEM", nil, system] + else + details = parse_id_invalid_details(accept_external_id: accept_external_id, + accept_public_id: accept_public_id) + message = "#{base_error_message}: #{details}" + raise REXML::ParseException.new(message, @source) + end + end + + def parse_id_invalid_details(accept_external_id:, + accept_public_id:) + public = /\A\s*PUBLIC/um + system = /\A\s*SYSTEM/um + if (accept_external_id or accept_public_id) and @source.match(/#{public}/um) + if @source.match(/#{public}(?:\s+[^'"]|\s*[\[>])/um) + return "public ID literal is missing" + end + unless @source.match(/#{public}\s+#{PUBIDLITERAL}/um) + return "invalid public ID literal" + end + if accept_public_id + if @source.match(/#{public}\s+#{PUBIDLITERAL}\s+[^'"]/um) + return "system ID literal is missing" + end + unless @source.match(/#{public}\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}/um) + return "invalid system literal" + end + "garbage after system literal" + else + "garbage after public ID literal" + end + elsif accept_external_id and @source.match(/#{system}/um) + if @source.match(/#{system}(?:\s+[^'"]|\s*[\[>])/um) + return "system literal is missing" + end + unless @source.match(/#{system}\s+#{SYSTEMLITERAL}/um) + return "invalid system literal" + end + "garbage after system literal" + else + unless @source.match(/\A\s*(?:PUBLIC|SYSTEM)\s/um) + return "invalid ID type" + end + "ID type is missing" + end + end + + def parse_attributes(prefixes, curr_ns) + attributes = {} + closed = false + match_data = @source.match(/^(.*?)(\/)?>/um, true) + if match_data.nil? + message = "Start tag isn't ended" + raise REXML::ParseException.new(message, @source) + end + + raw_attributes = match_data[1] + closed = !match_data[2].nil? + return attributes, closed if raw_attributes.nil? + return attributes, closed if raw_attributes.empty? + + scanner = StringScanner.new(raw_attributes) + until scanner.eos? + if scanner.scan(/\s+/) + break if scanner.eos? + end + + pos = scanner.pos + loop do + break if scanner.scan(ATTRIBUTE_PATTERN) + unless scanner.scan(QNAME) + message = "Invalid attribute name: <#{scanner.rest}>" + raise REXML::ParseException.new(message, @source) + end + name = scanner[0] + unless scanner.scan(/\s*=\s*/um) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + quote = scanner.scan(/['"]/) + unless quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless scanner.scan(/.*#{Regexp.escape(quote)}/um) + match_data = @source.match(/^(.*?)(\/)?>/um, true) + if match_data + scanner << "/" if closed + scanner << ">" + scanner << match_data[1] + scanner.pos = pos + closed = !match_data[2].nil? + next + end + message = + "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + end + name = scanner[1] + prefix = scanner[2] + local_part = scanner[3] + # quote = scanner[4] + value = scanner[5] + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self) + end + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" + end + + if attributes.has_key?(name) + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end + + attributes[name] = value + end + return attributes, closed + end end end end diff --git a/lib/rexml/parsers/lightparser.rb b/lib/rexml/parsers/lightparser.rb index 0f35034993..f0601ae51b 100644 --- a/lib/rexml/parsers/lightparser.rb +++ b/lib/rexml/parsers/lightparser.rb @@ -1,14 +1,15 @@ +# frozen_string_literal: false require 'rexml/parsers/streamparser' require 'rexml/parsers/baseparser' require 'rexml/light/node' module REXML - module Parsers - class LightParser - def initialize stream - @stream = stream - @parser = REXML::Parsers::BaseParser.new( stream ) - end + module Parsers + class LightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end def add_listener( listener ) @parser.add_listener( listener ) @@ -19,42 +20,40 @@ module REXML @parser.stream = @stream end - def parse - root = context = [ :document ] - while true - event = @parser.pull - case event[0] - when :end_document - break - when :end_doctype - context = context[1] - when :start_element, :start_doctype - new_node = event - context << new_node - new_node[1,0] = [context] - context = new_node - when :end_element, :end_doctype - context = context[1] - else - new_node = event - context << new_node - new_node[1,0] = [context] - end - end - root - end - end + def parse + root = context = [ :document ] + while true + event = @parser.pull + case event[0] + when :end_document + break + when :start_element, :start_doctype + new_node = event + context << new_node + new_node[1,0] = [context] + context = new_node + when :end_element, :end_doctype + context = context[1] + else + new_node = event + context << new_node + new_node[1,0] = [context] + end + end + root + end + end - # An element is an array. The array contains: - # 0 The parent element - # 1 The tag name - # 2 A hash of attributes - # 3..-1 The child elements - # An element is an array of size > 3 - # Text is a String - # PIs are [ :processing_instruction, target, data ] - # Comments are [ :comment, data ] - # DocTypes are DocType structs - # The root is an array with XMLDecls, Text, DocType, Array, Text - end + # An element is an array. The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb index 36dc7160c3..8c49217553 100644 --- a/lib/rexml/parsers/pullparser.rb +++ b/lib/rexml/parsers/pullparser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'forwardable' require 'rexml/parseexception' @@ -68,7 +69,7 @@ module REXML event = @parser.pull case event[0] when :entitydecl - @entities[ event[1] ] = + @entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/ when :text unnormalized = @parser.unnormalize( event[1], @entities ) diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb index e402eb7747..1386f69c83 100644 --- a/lib/rexml/parsers/sax2parser.rb +++ b/lib/rexml/parsers/sax2parser.rb @@ -1,165 +1,175 @@ +# frozen_string_literal: false require 'rexml/parsers/baseparser' require 'rexml/parseexception' require 'rexml/namespace' require 'rexml/text' module REXML - module Parsers + module Parsers # SAX2Parser - class SAX2Parser - def initialize source - @parser = BaseParser.new(source) - @listeners = [] - @procs = [] - @namespace_stack = [] - @has_listeners = false - @tag_stack = [] + class SAX2Parser + def initialize source + @parser = BaseParser.new(source) + @listeners = [] + @procs = [] + @namespace_stack = [] + @has_listeners = false + @tag_stack = [] @entities = {} - end + end def source @parser.source end - + def add_listener( listener ) @parser.add_listener( listener ) end - # Listen arguments: - # - # Symbol, Array, Block - # Listen to Symbol events on Array elements - # Symbol, Block - # Listen to Symbol events - # Array, Listener - # Listen to all events on Array elements - # Array, Block - # Listen to :start_element events on Array elements - # Listener - # Listen to All events - # - # Symbol can be one of: :start_element, :end_element, - # :start_prefix_mapping, :end_prefix_mapping, :characters, - # :processing_instruction, :doctype, :attlistdecl, :elementdecl, - # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment + # Listen arguments: + # + # Symbol, Array, Block + # Listen to Symbol events on Array elements + # Symbol, Block + # Listen to Symbol events + # Array, Listener + # Listen to all events on Array elements + # Array, Block + # Listen to :start_element events on Array elements + # Listener + # Listen to All events + # + # Symbol can be one of: :start_element, :end_element, + # :start_prefix_mapping, :end_prefix_mapping, :characters, + # :processing_instruction, :doctype, :attlistdecl, :elementdecl, + # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment # # There is an additional symbol that can be listened for: :progress. - # This will be called for every event generated, passing in the current + # This will be called for every event generated, passing in the current # stream position. - # - # Array contains regular expressions or strings which will be matched - # against fully qualified element names. - # - # Listener must implement the methods in SAX2Listener - # - # Block will be passed the same arguments as a SAX2Listener method would - # be, where the method name is the same as the matched Symbol. - # See the SAX2Listener for more information. - def listen( *args, &blok ) - if args[0].kind_of? Symbol - if args.size == 2 - args[1].each { |match| @procs << [args[0], match, blok] } - else - add( [args[0], nil, blok] ) - end - elsif args[0].kind_of? Array - if args.size == 2 - args[0].each { |match| add( [nil, match, args[1]] ) } - else - args[0].each { |match| add( [ :start_element, match, blok ] ) } - end - else - add([nil, nil, args[0]]) - end - end - - def deafen( listener=nil, &blok ) - if listener - @listeners.delete_if {|item| item[-1] == listener } - @has_listeners = false if @listeners.size == 0 - else - @procs.delete_if {|item| item[-1] == blok } - end - end - - def parse - @procs.each { |sym,match,block| block.call if sym == :start_document } - @listeners.each { |sym,match,block| - block.start_document if sym == :start_document or sym.nil? - } - root = context = [] - while true - event = @parser.pull - case event[0] - when :end_document - handle( :end_document ) - break + # + # Array contains regular expressions or strings which will be matched + # against fully qualified element names. + # + # Listener must implement the methods in SAX2Listener + # + # Block will be passed the same arguments as a SAX2Listener method would + # be, where the method name is the same as the matched Symbol. + # See the SAX2Listener for more information. + def listen( *args, &blok ) + if args[0].kind_of? Symbol + if args.size == 2 + args[1].each { |match| @procs << [args[0], match, blok] } + else + add( [args[0], nil, blok] ) + end + elsif args[0].kind_of? Array + if args.size == 2 + args[0].each { |match| add( [nil, match, args[1]] ) } + else + args[0].each { |match| add( [ :start_element, match, blok ] ) } + end + else + add([nil, nil, args[0]]) + end + end + + def deafen( listener=nil, &blok ) + if listener + @listeners.delete_if {|item| item[-1] == listener } + @has_listeners = false if @listeners.size == 0 + else + @procs.delete_if {|item| item[-1] == blok } + end + end + + def parse + @procs.each { |sym,match,block| block.call if sym == :start_document } + @listeners.each { |sym,match,block| + block.start_document if sym == :start_document or sym.nil? + } + context = [] + while true + event = @parser.pull + case event[0] + when :end_document + handle( :end_document ) + break when :start_doctype handle( :doctype, *event[1..-1]) - when :end_doctype - context = context[1] - when :start_element - @tag_stack.push(event[1]) - # find the observers for namespaces - procs = get_procs( :start_prefix_mapping, event[1] ) - listeners = get_listeners( :start_prefix_mapping, event[1] ) - if procs or listeners - # break out the namespace declarations - # The attributes live in event[2] - event[2].each {|n, v| event[2][n] = @parser.normalize(v)} - nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ } - nsdecl.collect! { |n, value| [ n[6..-1], value ] } - @namespace_stack.push({}) - nsdecl.each do |n,v| - @namespace_stack[-1][n] = v - # notify observers of namespaces - procs.each { |ob| ob.call( n, v ) } if procs - listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners - end - end - event[1] =~ Namespace::NAMESPLIT - prefix = $1 - local = $2 - uri = get_namespace(prefix) - # find the observers for start_element - procs = get_procs( :start_element, event[1] ) - listeners = get_listeners( :start_element, event[1] ) - # notify observers - procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs - listeners.each { |ob| - ob.start_element( uri, local, event[1], event[2] ) - } if listeners - when :end_element - @tag_stack.pop - event[1] =~ Namespace::NAMESPLIT - prefix = $1 - local = $2 - uri = get_namespace(prefix) - # find the observers for start_element - procs = get_procs( :end_element, event[1] ) - listeners = get_listeners( :end_element, event[1] ) - # notify observers - procs.each { |ob| ob.call( uri, local, event[1] ) } if procs - listeners.each { |ob| - ob.end_element( uri, local, event[1] ) - } if listeners + when :end_doctype + context = context[1] + when :start_element + @tag_stack.push(event[1]) + # find the observers for namespaces + procs = get_procs( :start_prefix_mapping, event[1] ) + listeners = get_listeners( :start_prefix_mapping, event[1] ) + if procs or listeners + # break out the namespace declarations + # The attributes live in event[2] + event[2].each {|n, v| event[2][n] = @parser.normalize(v)} + nsdecl = event[2].find_all { |n, value| n =~ /^xmlns(:|$)/ } + nsdecl.collect! { |n, value| [ n[6..-1], value ] } + @namespace_stack.push({}) + nsdecl.each do |n,v| + @namespace_stack[-1][n] = v + # notify observers of namespaces + procs.each { |ob| ob.call( n, v ) } if procs + listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners + end + end + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :start_element, event[1] ) + listeners = get_listeners( :start_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs + listeners.each { |ob| + ob.start_element( uri, local, event[1], event[2] ) + } if listeners + when :end_element + @tag_stack.pop + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :end_element, event[1] ) + listeners = get_listeners( :end_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1] ) } if procs + listeners.each { |ob| + ob.end_element( uri, local, event[1] ) + } if listeners - namespace_mapping = @namespace_stack.pop - # find the observers for namespaces - procs = get_procs( :end_prefix_mapping, event[1] ) - listeners = get_listeners( :end_prefix_mapping, event[1] ) - if procs or listeners - namespace_mapping.each do |prefix, uri| - # notify observers of namespaces - procs.each { |ob| ob.call( prefix ) } if procs - listeners.each { |ob| ob.end_prefix_mapping(prefix) } if listeners - end - end - when :text + namespace_mapping = @namespace_stack.pop + # find the observers for namespaces + procs = get_procs( :end_prefix_mapping, event[1] ) + listeners = get_listeners( :end_prefix_mapping, event[1] ) + if procs or listeners + namespace_mapping.each do |ns_prefix, ns_uri| + # notify observers of namespaces + procs.each { |ob| ob.call( ns_prefix ) } if procs + listeners.each { |ob| ob.end_prefix_mapping(ns_prefix) } if listeners + end + end + when :text #normalized = @parser.normalize( event[1] ) #handle( :characters, normalized ) copy = event[1].clone - @entities.each { |key, value| copy = copy.gsub("&#{key};", value) } + + esub = proc { |match| + if @entities.has_key?($1) + @entities[$1].gsub(Text::REFERENCE, &esub) + else + match + end + } + + copy.gsub!( Text::REFERENCE, &esub ) copy.gsub!( Text::NUMERICENTITY ) {|m| m=$1 m = "0#{m}" if m[0] == ?x @@ -167,72 +177,97 @@ module REXML } handle( :characters, copy ) when :entitydecl - @entities[ event[1] ] = event[2] if event.size == 3 - handle( *event ) - when :processing_instruction, :comment, :attlistdecl, - :elementdecl, :cdata, :notationdecl, :xmldecl - handle( *event ) - end + handle_entitydecl( event ) + when :processing_instruction, :comment, :attlistdecl, + :elementdecl, :cdata, :notationdecl, :xmldecl + handle( *event ) + end handle( :progress, @parser.position ) - end - end + end + end - private - def handle( symbol, *arguments ) - tag = @tag_stack[-1] - procs = get_procs( symbol, tag ) - listeners = get_listeners( symbol, tag ) - # notify observers - procs.each { |ob| ob.call( *arguments ) } if procs - listeners.each { |l| - l.send( symbol.to_s, *arguments ) - } if listeners - end + private + def handle( symbol, *arguments ) + tag = @tag_stack[-1] + procs = get_procs( symbol, tag ) + listeners = get_listeners( symbol, tag ) + # notify observers + procs.each { |ob| ob.call( *arguments ) } if procs + listeners.each { |l| + l.send( symbol.to_s, *arguments ) + } if listeners + end - # The following methods are duplicates, but it is faster than using - # a helper - def get_procs( symbol, name ) - return nil if @procs.size == 0 - @procs.find_all do |sym, match, block| - #puts sym.inspect+"=="+symbol.inspect+ "\t"+match.inspect+"=="+name.inspect+ "\t"+( (sym.nil? or symbol == sym) and ((name.nil? and match.nil?) or match.nil? or ( (name == match) or (match.kind_of? Regexp and name =~ match)))).to_s - ( - (sym.nil? or symbol == sym) and - ((name.nil? and match.nil?) or match.nil? or ( - (name == match) or - (match.kind_of? Regexp and name =~ match) - ) - ) - ) - end.collect{|x| x[-1]} - end - def get_listeners( symbol, name ) - return nil if @listeners.size == 0 - @listeners.find_all do |sym, match, block| - ( - (sym.nil? or symbol == sym) and - ((name.nil? and match.nil?) or match.nil? or ( - (name == match) or - (match.kind_of? Regexp and name =~ match) - ) - ) - ) - end.collect{|x| x[-1]} - end + def handle_entitydecl( event ) + @entities[ event[1] ] = event[2] if event.size == 3 + parameter_reference_p = false + case event[2] + when "SYSTEM" + if event.size == 5 + if event.last == "%" + parameter_reference_p = true + else + event[4, 0] = "NDATA" + end + end + when "PUBLIC" + if event.size == 6 + if event.last == "%" + parameter_reference_p = true + else + event[5, 0] = "NDATA" + end + end + else + parameter_reference_p = (event.size == 4) + end + event[1, 0] = event.pop if parameter_reference_p + handle( event[0], event[1..-1] ) + end - def add( pair ) - if pair[-1].respond_to? :call - @procs << pair unless @procs.include? pair - else - @listeners << pair unless @listeners.include? pair - @has_listeners = true - end - end + # The following methods are duplicates, but it is faster than using + # a helper + def get_procs( symbol, name ) + return nil if @procs.size == 0 + @procs.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + ((name.nil? and match.nil?) or match.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end + def get_listeners( symbol, name ) + return nil if @listeners.size == 0 + @listeners.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + ((name.nil? and match.nil?) or match.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end - def get_namespace( prefix ) + def add( pair ) + if pair[-1].respond_to? :call + @procs << pair unless @procs.include? pair + else + @listeners << pair unless @listeners.include? pair + @has_listeners = true + end + end + + def get_namespace( prefix ) uris = (@namespace_stack.find_all { |ns| not ns[prefix].nil? }) || - (@namespace_stack.find { |ns| not ns[nil].nil? }) - uris[-1][prefix] unless uris.nil? or 0 == uris.size - end - end - end + (@namespace_stack.find { |ns| not ns[nil].nil? }) + uris[-1][prefix] unless uris.nil? or 0 == uris.size + end + end + end end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb index 256d0f611c..f6a8bfa802 100644 --- a/lib/rexml/parsers/streamparser.rb +++ b/lib/rexml/parsers/streamparser.rb @@ -1,29 +1,40 @@ +# frozen_string_literal: false +require "rexml/parsers/baseparser" + module REXML module Parsers class StreamParser def initialize source, listener @listener = listener @parser = BaseParser.new( source ) + @tag_stack = [] end - + def add_listener( listener ) @parser.add_listener( listener ) end - + def parse # entity string while true event = @parser.pull case event[0] when :end_document + unless @tag_stack.empty? + tag_path = "/" + @tag_stack.join("/") + raise ParseException.new("Missing end tag for '#{tag_path}'", + @parser.source) + end return when :start_element + @tag_stack << event[1] attrs = event[2].each do |n, v| event[2][n] = @parser.unnormalize( v ) end @listener.tag_start( event[1], attrs ) when :end_element @listener.tag_end( event[1] ) + @tag_stack.pop when :text normalized = @parser.unnormalize( event[1] ) @listener.text( normalized ) @@ -38,6 +49,10 @@ module REXML @listener.send( event[0].to_s, *event[1..-1] ) when :entitydecl, :notationdecl @listener.send( event[0].to_s, event[1..-1] ) + when :externalentity + entity_reference = event[1] + content = entity_reference.gsub(/\A%|;\z/, "") + @listener.entity(content) end end end diff --git a/lib/rexml/parsers/treeparser.rb b/lib/rexml/parsers/treeparser.rb index 5c3e142ea7..fc0993c72a 100644 --- a/lib/rexml/parsers/treeparser.rb +++ b/lib/rexml/parsers/treeparser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/validation/validationexception' require 'rexml/undefinednamespaceexception' @@ -24,13 +25,16 @@ module REXML case event[0] when :end_document unless tag_stack.empty? - #raise ParseException.new("No close tag for #{tag_stack.inspect}") - raise ParseException.new("No close tag for #{@build_context.xpath}") + raise ParseException.new("No close tag for #{@build_context.xpath}", + @parser.source, @parser) end return when :start_element tag_stack.push(event[1]) - el = @build_context = @build_context.add_element( event[1], event[2] ) + el = @build_context = @build_context.add_element( event[1] ) + event[2].each do |key, value| + el.attributes[key]=Attribute.new(key,value,self) + end when :end_element tag_stack.pop @build_context = @build_context.parent @@ -39,8 +43,8 @@ module REXML if @build_context[-1].instance_of? Text @build_context[-1] << event[1] else - @build_context.add( - Text.new(event[1], @build_context.whitespace, nil, true) + @build_context.add( + Text.new(event[1], @build_context.whitespace, nil, true) ) unless ( @build_context.ignore_whitespace_nodes and event[1].strip.size==0 @@ -86,7 +90,7 @@ module REXML end rescue REXML::Validation::ValidationException raise - rescue REXML::UndefinedNamespaceException + rescue REXML::ParseException raise rescue raise ParseException.new( $!.message, @parser.source, @parser, $! ) diff --git a/lib/rexml/parsers/ultralightparser.rb b/lib/rexml/parsers/ultralightparser.rb index adc4af18e2..6571d119bd 100644 --- a/lib/rexml/parsers/ultralightparser.rb +++ b/lib/rexml/parsers/ultralightparser.rb @@ -1,13 +1,14 @@ +# frozen_string_literal: false require 'rexml/parsers/streamparser' require 'rexml/parsers/baseparser' module REXML - module Parsers - class UltraLightParser - def initialize stream - @stream = stream - @parser = REXML::Parsers::BaseParser.new( stream ) - end + module Parsers + class UltraLightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end def add_listener( listener ) @parser.add_listener( listener ) @@ -18,39 +19,39 @@ module REXML @parser.stream = @stream end - def parse - root = context = [] - while true - event = @parser.pull - case event[0] - when :end_document - break - when :end_doctype - context = context[1] - when :start_element, :doctype - context << event - event[1,0] = [context] - context = event - when :end_element - context = context[1] - else - context << event - end - end - root - end - end + def parse + root = context = [] + while true + event = @parser.pull + case event[0] + when :end_document + break + when :end_doctype + context = context[1] + when :start_element, :start_doctype + context << event + event[1,0] = [context] + context = event + when :end_element + context = context[1] + else + context << event + end + end + root + end + end - # An element is an array. The array contains: - # 0 The parent element - # 1 The tag name - # 2 A hash of attributes - # 3..-1 The child elements - # An element is an array of size > 3 - # Text is a String - # PIs are [ :processing_instruction, target, data ] - # Comments are [ :comment, data ] - # DocTypes are DocType structs - # The root is an array with XMLDecls, Text, DocType, Array, Text - end + # An element is an array. The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end end diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb index de2530e347..32b70bb798 100644 --- a/lib/rexml/parsers/xpathparser.rb +++ b/lib/rexml/parsers/xpathparser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/namespace' require 'rexml/xmltokens' @@ -17,10 +18,11 @@ module REXML end def parse path + path = path.dup path.gsub!(/([\(\[])\s+/, '\1') # Strip ignorable spaces - path.gsub!( /\s+([\]\)])/, '\1' ) + path.gsub!( /\s+([\]\)])/, '\1') parsed = [] - path = OrExpr(path, parsed) + OrExpr(path, parsed) parsed end @@ -39,10 +41,10 @@ module REXML case op when :node when :attribute - string << "/" if string.size > 0 - string << "@" + string << "/" if string.size > 0 + string << "@" when :child - string << "/" if string.size > 0 + string << "/" if string.size > 0 when :descendant_or_self string << "/" when :self @@ -51,10 +53,10 @@ module REXML string << ".." when :any string << "*" - when :text - string << "text()" - when :following, :following_sibling, - :ancestor, :ancestor_or_self, :descendant, + when :text + string << "text()" + when :following, :following_sibling, + :ancestor, :ancestor_or_self, :descendant, :namespace, :preceding, :preceding_sibling string << "/" unless string.size == 0 string << op.to_s.tr("_", "-") @@ -70,13 +72,13 @@ module REXML string << ']' when :document document = true - when :function - string << path.shift - string << "( " - string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} - string << " )" - when :literal - string << %Q{ "#{path.shift}" } + when :function + string << path.shift + string << "( " + string << predicate_to_string( path.shift[0] ) {|x| abbreviate( x )} + string << " )" + when :literal + string << %Q{ "#{path.shift}" } else string << "/" unless string.size == 0 string << "UNKNOWN(" @@ -84,7 +86,7 @@ module REXML string << ")" end end - string = "/"+string if document + string = "/"+string if document return string end @@ -97,7 +99,7 @@ module REXML case op when :node string << "node()" - when :attribute, :child, :following, :following_sibling, + when :attribute, :child, :following, :following_sibling, :ancestor, :ancestor_or_self, :descendant, :descendant_or_self, :namespace, :preceding, :preceding_sibling, :self, :parent string << "/" unless string.size == 0 @@ -183,7 +185,6 @@ module REXML # | '/' RelativeLocationPath? # | '//' RelativeLocationPath def LocationPath path, parsed - #puts "LocationPath '#{path}'" path = path.strip if path[0] == ?/ parsed << :document @@ -195,7 +196,6 @@ module REXML path = path[1..-1] end end - #puts parsed.inspect return RelativeLocationPath( path, parsed ) if path.size > 0 end @@ -209,7 +209,6 @@ module REXML # | RelativeLocationPath '//' Step AXIS = /^(ancestor|ancestor-or-self|attribute|child|descendant|descendant-or-self|following|following-sibling|namespace|parent|preceding|preceding-sibling|self)::/ def RelativeLocationPath path, parsed - #puts "RelativeLocationPath #{path}" while path.size > 0 # (axis or @ or <child::>) nodetest predicate > # OR > / Step @@ -226,7 +225,6 @@ module REXML end else if path[0] == ?@ - #puts "ATTRIBUTE" parsed << :attribute path = path[1..-1] # Goto Nodetest @@ -238,10 +236,8 @@ module REXML parsed << :child end - #puts "NODETESTING '#{path}'" n = [] path = NodeTest( path, n) - #puts "NODETEST RETURNED '#{path}'" if path[0] == ?[ path = Predicate( path, n ) @@ -249,7 +245,7 @@ module REXML parsed.concat(n) end - + if path.size > 0 if path[0] == ?/ if path[1] == ?/ @@ -281,8 +277,6 @@ module REXML NODE_TYPE = /^(comment|text|node)\(\s*\)/m PI = /^processing-instruction\(/ def NodeTest path, parsed - #puts "NodeTest with #{path}" - res = nil case path when /^\*/ path = $' @@ -304,13 +298,11 @@ module REXML parsed << :processing_instruction parsed << (literal || '') when NCNAMETEST - #puts "NCNAMETEST" prefix = $1 path = $' parsed << :namespace parsed << prefix when QNAME - #puts "QNAME" prefix = $1 name = $2 path = $' @@ -324,22 +316,18 @@ module REXML # Filters the supplied nodeset on the predicate(s) def Predicate path, parsed - #puts "PREDICATE with #{path}" return nil unless path[0] == ?[ predicates = [] while path[0] == ?[ path, expr = get_group(path) predicates << expr[1..-2] if expr end - #puts "PREDICATES = #{predicates.inspect}" - predicates.each{ |expr| - #puts "ORING #{expr}" + predicates.each{ |pred| preds = [] parsed << :predicate parsed << preds - OrExpr(expr, preds) + OrExpr(pred, preds) } - #puts "PREDICATES = #{predicates.inspect}" path end @@ -350,10 +338,8 @@ module REXML #| OrExpr S 'or' S AndExpr #| AndExpr def OrExpr path, parsed - #puts "OR >>> #{path}" n = [] rest = AndExpr( path, n ) - #puts "OR <<< #{rest}" if rest != path while rest =~ /^\s*( or )/ n = [ :or, n, [] ] @@ -371,16 +357,12 @@ module REXML #| AndExpr S 'and' S EqualityExpr #| EqualityExpr def AndExpr path, parsed - #puts "AND >>> #{path}" n = [] rest = EqualityExpr( path, n ) - #puts "AND <<< #{rest}" if rest != path while rest =~ /^\s*( and )/ n = [ :and, n, [] ] - #puts "AND >>> #{rest}" rest = EqualityExpr( $', n[-1] ) - #puts "AND <<< #{rest}" end end if parsed.size == 0 and n.size != 0 @@ -394,10 +376,8 @@ module REXML #| EqualityExpr ('=' | '!=') RelationalExpr #| RelationalExpr def EqualityExpr path, parsed - #puts "EQUALITY >>> #{path}" n = [] rest = RelationalExpr( path, n ) - #puts "EQUALITY <<< #{rest}" if rest != path while rest =~ /^\s*(!?=)\s*/ if $1[0] == ?! @@ -419,10 +399,8 @@ module REXML #| RelationalExpr ('<' | '>' | '<=' | '>=') AdditiveExpr #| AdditiveExpr def RelationalExpr path, parsed - #puts "RELATION >>> #{path}" n = [] rest = AdditiveExpr( path, n ) - #puts "RELATION <<< #{rest}" if rest != path while rest =~ /^\s*([<>]=?)\s*/ if $1[0] == ?< @@ -446,10 +424,8 @@ module REXML #| AdditiveExpr ('+' | S '-') MultiplicativeExpr #| MultiplicativeExpr def AdditiveExpr path, parsed - #puts "ADDITIVE >>> #{path}" n = [] rest = MultiplicativeExpr( path, n ) - #puts "ADDITIVE <<< #{rest}" if rest != path while rest =~ /^\s*(\+| -)\s*/ if $1[0] == ?+ @@ -471,10 +447,8 @@ module REXML #| MultiplicativeExpr ('*' | S ('div' | 'mod') S) UnaryExpr #| UnaryExpr def MultiplicativeExpr path, parsed - #puts "MULT >>> #{path}" n = [] rest = UnaryExpr( path, n ) - #puts "MULT <<< #{rest}" if rest != path while rest =~ /^\s*(\*| div | mod )\s*/ if $1[0] == ?* @@ -507,10 +481,8 @@ module REXML end parsed << :neg if mult < 0 - #puts "UNARY >>> #{path}" n = [] path = UnionExpr( path, n ) - #puts "UNARY <<< #{path}" parsed.concat( n ) path end @@ -518,10 +490,8 @@ module REXML #| UnionExpr '|' PathExpr #| PathExpr def UnionExpr path, parsed - #puts "UNION >>> #{path}" n = [] rest = PathExpr( path, n ) - #puts "UNION <<< #{rest}" if rest != path while rest =~ /^\s*(\|)\s*/ n = [ :union, n, [] ] @@ -541,17 +511,14 @@ module REXML def PathExpr path, parsed path =~ /^\s*/ path = $' - #puts "PATH >>> #{path}" n = [] rest = FilterExpr( path, n ) - #puts "PATH <<< '#{rest}'" if rest != path if rest and rest[0] == ?/ return RelativeLocationPath(rest, n) end end - #puts "BEFORE WITH '#{rest}'" - rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w_*]/ + rest = LocationPath(rest, n) if rest =~ /\A[\/\.\@\[\w*]/ parsed.concat(n) return rest end @@ -559,12 +526,9 @@ module REXML #| FilterExpr Predicate #| PrimaryExpr def FilterExpr path, parsed - #puts "FILTER >>> #{path}" n = [] path = PrimaryExpr( path, n ) - #puts "FILTER <<< #{path}" path = Predicate(path, n) if path and path[0] == ?[ - #puts "FILTER <<< #{path}" parsed.concat(n) path end @@ -578,7 +542,6 @@ module REXML NUMBER = /^(\d*\.?\d+)/ NT = /^comment|text|processing-instruction|node$/ def PrimaryExpr path, parsed - arry = [] case path when VARIABLE_REFERENCE varname = $1 @@ -587,26 +550,22 @@ module REXML parsed << varname #arry << @variables[ varname ] when /^(\w[-\w]*)(?:\()/ - #puts "PrimaryExpr :: Function >>> #$1 -- '#$''" fname = $1 tmp = $' - #puts "#{fname} =~ #{NT.inspect}" return path if fname =~ NT path = tmp parsed << :function parsed << fname path = FunctionCall(path, parsed) when NUMBER - #puts "LITERAL or NUMBER: #$1" varname = $1.nil? ? $2 : $1 path = $' - parsed << :literal + parsed << :literal parsed << (varname.include?('.') ? varname.to_f : varname.to_i) when LITERAL - #puts "LITERAL or NUMBER: #$1" varname = $1.nil? ? $2 : $1 path = $' - parsed << :literal + parsed << :literal parsed << varname when /^\(/ #/ path, contents = get_group(path) @@ -649,43 +608,43 @@ module REXML return nil unless depth==0 [string[ind..-1], string[0..ind-1]] end - + def parse_args( string ) arguments = [] ind = 0 - inquot = false - inapos = false + inquot = false + inapos = false depth = 1 begin case string[ind] when ?" - inquot = !inquot unless inapos + inquot = !inquot unless inapos when ?' - inapos = !inapos unless inquot + inapos = !inapos unless inquot else - unless inquot or inapos - case string[ind] - when ?( - depth += 1 + unless inquot or inapos + case string[ind] + when ?( + depth += 1 if depth == 1 - string = string[1..-1] - ind -= 1 + string = string[1..-1] + ind -= 1 + end + when ?) + depth -= 1 + if depth == 0 + s = string[0,ind].strip + arguments << s unless s == "" + string = string[ind+1..-1] end - when ?) - depth -= 1 - if depth == 0 - s = string[0,ind].strip - arguments << s unless s == "" - string = string[ind+1..-1] - end - when ?, - if depth == 1 - s = string[0,ind].strip - arguments << s unless s == "" - string = string[ind+1..-1] - ind = -1 - end - end + when ?, + if depth == 1 + s = string[0,ind].strip + arguments << s unless s == "" + string = string[ind+1..-1] + ind = -1 + end + end end end ind += 1 diff --git a/lib/rexml/quickpath.rb b/lib/rexml/quickpath.rb index c099db8579..5d6c77ca38 100644 --- a/lib/rexml/quickpath.rb +++ b/lib/rexml/quickpath.rb @@ -1,266 +1,266 @@ +# frozen_string_literal: false require 'rexml/functions' require 'rexml/xmltokens' module REXML - class QuickPath - include Functions - include XMLTokens + class QuickPath + include Functions + include XMLTokens - EMPTY_HASH = {} + # A base Hash object to be used when initializing a + # default empty namespaces set. + EMPTY_HASH = {} - def QuickPath::first element, path, namespaces=EMPTY_HASH - match(element, path, namespaces)[0] - end + def QuickPath::first element, path, namespaces=EMPTY_HASH + match(element, path, namespaces)[0] + end - def QuickPath::each element, path, namespaces=EMPTY_HASH, &block - path = "*" unless path - match(element, path, namespaces).each( &block ) - end + def QuickPath::each element, path, namespaces=EMPTY_HASH, &block + path = "*" unless path + match(element, path, namespaces).each( &block ) + end - def QuickPath::match element, path, namespaces=EMPTY_HASH - raise "nil is not a valid xpath" unless path - results = nil - Functions::namespace_context = namespaces - case path - when /^\/([^\/]|$)/u - # match on root - path = path[1..-1] - return [element.root.parent] if path == '' - results = filter([element.root], path) - when /^[-\w]*::/u - results = filter([element], path) - when /^\*/u - results = filter(element.to_a, path) - when /^[\[!\w:]/u - # match on child - matches = [] - children = element.to_a - results = filter(children, path) - else - results = filter([element], path) - end - return results - end + def QuickPath::match element, path, namespaces=EMPTY_HASH + raise "nil is not a valid xpath" unless path + results = nil + Functions::namespace_context = namespaces + case path + when /^\/([^\/]|$)/u + # match on root + path = path[1..-1] + return [element.root.parent] if path == '' + results = filter([element.root], path) + when /^[-\w]*::/u + results = filter([element], path) + when /^\*/u + results = filter(element.to_a, path) + when /^[\[!\w:]/u + # match on child + children = element.to_a + results = filter(children, path) + else + results = filter([element], path) + end + return results + end - # Given an array of nodes it filters the array based on the path. The - # result is that when this method returns, the array will contain elements - # which match the path - def QuickPath::filter elements, path - return elements if path.nil? or path == '' or elements.size == 0 - case path - when /^\/\//u # Descendant - return axe( elements, "descendant-or-self", $' ) - when /^\/?\b(\w[-\w]*)\b::/u # Axe - axe_name = $1 - rest = $' - return axe( elements, $1, $' ) - when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child - rest = $' - results = [] - elements.each do |element| - results |= filter( element.to_a, rest ) - end - return results - when /^\/?(\w[-\w]*)\(/u # / Function - return function( elements, $1, $' ) - when Namespace::NAMESPLIT # Element name - name = $2 - ns = $1 - rest = $' - elements.delete_if do |element| - !(element.kind_of? Element and - (element.expanded_name == name or - (element.name == name and - element.namespace == Functions.namespace_context[ns]))) - end - return filter( elements, rest ) - when /^\/\[/u - matches = [] - elements.each do |element| - matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element - end - return matches - when /^\[/u # Predicate - return predicate( elements, path ) - when /^\/?\.\.\./u # Ancestor - return axe( elements, "ancestor", $' ) - when /^\/?\.\./u # Parent - return filter( elements.collect{|e|e.parent}, $' ) - when /^\/?\./u # Self - return filter( elements, $' ) - when /^\*/u # Any - results = [] - elements.each do |element| - results |= filter( [element], $' ) if element.kind_of? Element - #if element.kind_of? Element - # children = element.to_a - # children.delete_if { |child| !child.kind_of?(Element) } - # results |= filter( children, $' ) - #end - end - return results - end - return [] - end + # Given an array of nodes it filters the array based on the path. The + # result is that when this method returns, the array will contain elements + # which match the path + def QuickPath::filter elements, path + return elements if path.nil? or path == '' or elements.size == 0 + case path + when /^\/\//u # Descendant + return axe( elements, "descendant-or-self", $' ) + when /^\/?\b(\w[-\w]*)\b::/u # Axe + return axe( elements, $1, $' ) + when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child + rest = $' + results = [] + elements.each do |element| + results |= filter( element.to_a, rest ) + end + return results + when /^\/?(\w[-\w]*)\(/u # / Function + return function( elements, $1, $' ) + when Namespace::NAMESPLIT # Element name + name = $2 + ns = $1 + rest = $' + elements.delete_if do |element| + !(element.kind_of? Element and + (element.expanded_name == name or + (element.name == name and + element.namespace == Functions.namespace_context[ns]))) + end + return filter( elements, rest ) + when /^\/\[/u + matches = [] + elements.each do |element| + matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element + end + return matches + when /^\[/u # Predicate + return predicate( elements, path ) + when /^\/?\.\.\./u # Ancestor + return axe( elements, "ancestor", $' ) + when /^\/?\.\./u # Parent + return filter( elements.collect{|e|e.parent}, $' ) + when /^\/?\./u # Self + return filter( elements, $' ) + when /^\*/u # Any + results = [] + elements.each do |element| + results |= filter( [element], $' ) if element.kind_of? Element + #if element.kind_of? Element + # children = element.to_a + # children.delete_if { |child| !child.kind_of?(Element) } + # results |= filter( children, $' ) + #end + end + return results + end + return [] + end - def QuickPath::axe( elements, axe_name, rest ) - matches = [] - matches = filter( elements.dup, rest ) if axe_name =~ /-or-self$/u - case axe_name - when /^descendant/u - elements.each do |element| - matches |= filter( element.to_a, "descendant-or-self::#{rest}" ) if element.kind_of? Element - end - when /^ancestor/u - elements.each do |element| - while element.parent - matches << element.parent - element = element.parent - end - end - matches = filter( matches, rest ) - when "self" - matches = filter( elements, rest ) - when "child" - elements.each do |element| - matches |= filter( element.to_a, rest ) if element.kind_of? Element - end - when "attribute" - elements.each do |element| - matches << element.attributes[ rest ] if element.kind_of? Element - end - when "parent" - matches = filter(elements.collect{|element| element.parent}.uniq, rest) - when "following-sibling" - matches = filter(elements.collect{|element| element.next_sibling}.uniq, - rest) - when "previous-sibling" - matches = filter(elements.collect{|element| - element.previous_sibling}.uniq, rest ) - end - return matches.uniq - end + def QuickPath::axe( elements, axe_name, rest ) + matches = [] + matches = filter( elements.dup, rest ) if axe_name =~ /-or-self$/u + case axe_name + when /^descendant/u + elements.each do |element| + matches |= filter( element.to_a, "descendant-or-self::#{rest}" ) if element.kind_of? Element + end + when /^ancestor/u + elements.each do |element| + while element.parent + matches << element.parent + element = element.parent + end + end + matches = filter( matches, rest ) + when "self" + matches = filter( elements, rest ) + when "child" + elements.each do |element| + matches |= filter( element.to_a, rest ) if element.kind_of? Element + end + when "attribute" + elements.each do |element| + matches << element.attributes[ rest ] if element.kind_of? Element + end + when "parent" + matches = filter(elements.collect{|element| element.parent}.uniq, rest) + when "following-sibling" + matches = filter(elements.collect{|element| element.next_sibling}.uniq, + rest) + when "previous-sibling" + matches = filter(elements.collect{|element| + element.previous_sibling}.uniq, rest ) + end + return matches.uniq + end - # A predicate filters a node-set with respect to an axis to produce a - # new node-set. For each node in the node-set to be filtered, the - # PredicateExpr is evaluated with that node as the context node, with - # the number of nodes in the node-set as the context size, and with the - # proximity position of the node in the node-set with respect to the - # axis as the context position; if PredicateExpr evaluates to true for - # that node, the node is included in the new node-set; otherwise, it is - # not included. - # - # A PredicateExpr is evaluated by evaluating the Expr and converting - # the result to a boolean. If the result is a number, the result will - # be converted to true if the number is equal to the context position - # and will be converted to false otherwise; if the result is not a - # number, then the result will be converted as if by a call to the - # boolean function. Thus a location path para[3] is equivalent to - # para[position()=3]. - def QuickPath::predicate( elements, path ) - ind = 1 - bcount = 1 - while bcount > 0 - bcount += 1 if path[ind] == ?[ - bcount -= 1 if path[ind] == ?] - ind += 1 - end - ind -= 1 - predicate = path[1..ind-1] - rest = path[ind+1..-1] + OPERAND_ = '((?=(?:(?!and|or).)*[^\s<>=])[^\s<>=]+)' + # A predicate filters a node-set with respect to an axis to produce a + # new node-set. For each node in the node-set to be filtered, the + # PredicateExpr is evaluated with that node as the context node, with + # the number of nodes in the node-set as the context size, and with the + # proximity position of the node in the node-set with respect to the + # axis as the context position; if PredicateExpr evaluates to true for + # that node, the node is included in the new node-set; otherwise, it is + # not included. + # + # A PredicateExpr is evaluated by evaluating the Expr and converting + # the result to a boolean. If the result is a number, the result will + # be converted to true if the number is equal to the context position + # and will be converted to false otherwise; if the result is not a + # number, then the result will be converted as if by a call to the + # boolean function. Thus a location path para[3] is equivalent to + # para[position()=3]. + def QuickPath::predicate( elements, path ) + ind = 1 + bcount = 1 + while bcount > 0 + bcount += 1 if path[ind] == ?[ + bcount -= 1 if path[ind] == ?] + ind += 1 + end + ind -= 1 + predicate = path[1..ind-1] + rest = path[ind+1..-1] - # have to change 'a [=<>] b [=<>] c' into 'a [=<>] b and b [=<>] c' - predicate.gsub!( /([^\s(and)(or)<>=]+)\s*([<>=])\s*([^\s(and)(or)<>=]+)\s*([<>=])\s*([^\s(and)(or)<>=]+)/u ) { - "#$1 #$2 #$3 and #$3 #$4 #$5" - } - # Let's do some Ruby trickery to avoid some work: - predicate.gsub!( /&/u, "&&" ) - predicate.gsub!( /=/u, "==" ) - predicate.gsub!( /@(\w[-\w.]*)/u ) { - "attribute(\"#$1\")" - } - predicate.gsub!( /\bmod\b/u, "%" ) - predicate.gsub!( /\b(\w[-\w.]*\()/u ) { - fname = $1 - fname.gsub( /-/u, "_" ) - } - - Functions.pair = [ 0, elements.size ] - results = [] - elements.each do |element| - Functions.pair[0] += 1 - Functions.node = element - res = eval( predicate ) - case res - when true - results << element - when Fixnum - results << element if Functions.pair[0] == res - when String - results << element - end - end - return filter( results, rest ) - end + # have to change 'a [=<>] b [=<>] c' into 'a [=<>] b and b [=<>] c' + # + predicate.gsub!( + /#{OPERAND_}\s*([<>=])\s*#{OPERAND_}\s*([<>=])\s*#{OPERAND_}/u, + '\1 \2 \3 and \3 \4 \5' ) + # Let's do some Ruby trickery to avoid some work: + predicate.gsub!( /&/u, "&&" ) + predicate.gsub!( /=/u, "==" ) + predicate.gsub!( /@(\w[-\w.]*)/u, 'attribute("\1")' ) + predicate.gsub!( /\bmod\b/u, "%" ) + predicate.gsub!( /\b(\w[-\w.]*\()/u ) { + fname = $1 + fname.gsub( /-/u, "_" ) + } - def QuickPath::attribute( name ) - return Functions.node.attributes[name] if Functions.node.kind_of? Element - end + Functions.pair = [ 0, elements.size ] + results = [] + elements.each do |element| + Functions.pair[0] += 1 + Functions.node = element + res = eval( predicate ) + case res + when true + results << element + when Integer + results << element if Functions.pair[0] == res + when String + results << element + end + end + return filter( results, rest ) + end - def QuickPath::name() - return Functions.node.name if Functions.node.kind_of? Element - end + def QuickPath::attribute( name ) + return Functions.node.attributes[name] if Functions.node.kind_of? Element + end - def QuickPath::method_missing( id, *args ) - begin - Functions.send( id.id2name, *args ) - rescue Exception - raise "METHOD: #{id.id2name}(#{args.join ', '})\n#{$!.message}" - end - end + def QuickPath::name() + return Functions.node.name if Functions.node.kind_of? Element + end - def QuickPath::function( elements, fname, rest ) - args = parse_args( elements, rest ) - Functions.pair = [0, elements.size] - results = [] - elements.each do |element| - Functions.pair[0] += 1 - Functions.node = element - res = Functions.send( fname, *args ) - case res - when true - results << element - when Fixnum - results << element if Functions.pair[0] == res - end - end - return results - end + def QuickPath::method_missing( id, *args ) + begin + Functions.send( id.id2name, *args ) + rescue Exception + raise "METHOD: #{id.id2name}(#{args.join ', '})\n#{$!.message}" + end + end - def QuickPath::parse_args( element, string ) - # /.*?(?:\)|,)/ - arguments = [] - buffer = "" - while string and string != "" - c = string[0] - string.sub!(/^./u, "") - case c - when ?, - # if depth = 1, then we start a new argument - arguments << evaluate( buffer ) - #arguments << evaluate( string[0..count] ) - when ?( - # start a new method call - function( element, buffer, string ) - buffer = "" - when ?) - # close the method call and return arguments - return arguments - else - buffer << c - end - end - "" - end - end + def QuickPath::function( elements, fname, rest ) + args = parse_args( elements, rest ) + Functions.pair = [0, elements.size] + results = [] + elements.each do |element| + Functions.pair[0] += 1 + Functions.node = element + res = Functions.send( fname, *args ) + case res + when true + results << element + when Integer + results << element if Functions.pair[0] == res + end + end + return results + end + + def QuickPath::parse_args( element, string ) + # /.*?(?:\)|,)/ + arguments = [] + buffer = "" + while string and string != "" + c = string[0] + string.sub!(/^./u, "") + case c + when ?, + # if depth = 1, then we start a new argument + arguments << evaluate( buffer ) + #arguments << evaluate( string[0..count] ) + when ?( + # start a new method call + function( element, buffer, string ) + buffer = "" + when ?) + # close the method call and return arguments + return arguments + else + buffer << c + end + end + "" + end + end end diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb index 8af1697e51..652d6429af 100644 --- a/lib/rexml/rexml.rb +++ b/lib/rexml/rexml.rb @@ -1,31 +1,31 @@ # -*- encoding: utf-8 -*- +# frozen_string_literal: false # REXML is an XML toolkit for Ruby[http://www.ruby-lang.org], in Ruby. # # REXML is a _pure_ Ruby, XML 1.0 conforming, # non-validating[http://www.w3.org/TR/2004/REC-xml-20040204/#sec-conformance] # toolkit with an intuitive API. REXML passes 100% of the non-validating Oasis # tests[http://www.oasis-open.org/committees/xml-conformance/xml-test-suite.shtml], -# and provides tree, stream, SAX2, pull, and lightweight APIs. REXML also -# includes a full XPath[http://www.w3c.org/tr/xpath] 1.0 implementation. Since +# and provides tree, stream, SAX2, pull, and lightweight APIs. REXML also +# includes a full XPath[http://www.w3c.org/tr/xpath] 1.0 implementation. Since # Ruby 1.8, REXML is included in the standard Ruby distribution. # # Main page:: http://www.germane-software.com/software/rexml # Author:: Sean Russell <serATgermaneHYPHENsoftwareDOTcom> -# Version:: 3.1.7.2 -# Date:: 2007/275 -# Revision:: $Revision$ -# +# Date:: 2008/019 +# Version:: 3.1.7.3 +# # This API documentation can be downloaded from the REXML home page, or can # be accessed online[http://www.germane-software.com/software/rexml_doc] # # A tutorial is available in the REXML distribution in docs/tutorial.html, -# or can be accessed +# or can be accessed # online[http://www.germane-software.com/software/rexml/docs/tutorial.html] module REXML - COPYRIGHT = "Copyright \xC2\xA9 2001-2006 Sean Russell <ser@germane-software.com>" - VERSION = "3.1.7.2" - DATE = "2007/275" - REVISION = "$Revision$".gsub(/\$Revision:|\$/,'').strip + COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>" + DATE = "2008/019" + VERSION = "3.1.7.4" + REVISION = %w$Revision$[1] || '' Copyright = COPYRIGHT Version = VERSION diff --git a/lib/rexml/sax2listener.rb b/lib/rexml/sax2listener.rb index 8db1389d06..5afdc80890 100644 --- a/lib/rexml/sax2listener.rb +++ b/lib/rexml/sax2listener.rb @@ -1,97 +1,98 @@ +# frozen_string_literal: false module REXML - # A template for stream parser listeners. - # Note that the declarations (attlistdecl, elementdecl, etc) are trivially - # processed; REXML doesn't yet handle doctype entity declarations, so you - # have to parse them out yourself. - # === Missing methods from SAX2 - # ignorable_whitespace - # === Methods extending SAX2 - # +WARNING+ - # These methods are certainly going to change, until DTDs are fully - # supported. Be aware of this. - # start_document - # end_document - # doctype - # elementdecl - # attlistdecl - # entitydecl - # notationdecl - # cdata - # xmldecl - # comment - module SAX2Listener - def start_document - end - def end_document - end - def start_prefix_mapping prefix, uri - end - def end_prefix_mapping prefix - end - def start_element uri, localname, qname, attributes - end - def end_element uri, localname, qname - end - def characters text - end - def processing_instruction target, data - end - # Handles a doctype declaration. Any attributes of the doctype which are - # not supplied will be nil. # EG, <!DOCTYPE me PUBLIC "foo" "bar"> - # @p name the name of the doctype; EG, "me" - # @p pub_sys "PUBLIC", "SYSTEM", or nil. EG, "PUBLIC" - # @p long_name the supplied long name, or nil. EG, "foo" - # @p uri the uri of the doctype, or nil. EG, "bar" - def doctype name, pub_sys, long_name, uri - end - # If a doctype includes an ATTLIST declaration, it will cause this - # method to be called. The content is the declaration itself, unparsed. - # EG, <!ATTLIST el attr CDATA #REQUIRED> will come to this method as "el - # attr CDATA #REQUIRED". This is the same for all of the .*decl - # methods. - def attlistdecl(element, pairs, contents) - end - # <!ELEMENT ...> - def elementdecl content - end - # <!ENTITY ...> - # The argument passed to this method is an array of the entity - # declaration. It can be in a number of formats, but in general it - # returns (example, result): - # <!ENTITY % YN '"Yes"'> - # ["%", "YN", "'\"Yes\"'", "\""] - # <!ENTITY % YN 'Yes'> - # ["%", "YN", "'Yes'", "s"] - # <!ENTITY WhatHeSaid "He said %YN;"> - # ["WhatHeSaid", "\"He said %YN;\"", "YN"] - # <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> - # ["open-hatch", "SYSTEM", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] - # <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"> - # ["open-hatch", "PUBLIC", "\"-//Textuality//TEXT Standard open-hatch boilerplate//EN\"", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] - # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> - # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl name, decl - end - # <!NOTATION ...> - def notationdecl content - end - # Called when <![CDATA[ ... ]]> is encountered in a document. - # @p content "..." - def cdata content - end - # Called when an XML PI is encountered in the document. - # EG: <?xml version="1.0" encoding="utf"?> - # @p version the version attribute value. EG, "1.0" - # @p encoding the encoding attribute value, or nil. EG, "utf" - # @p standalone the standalone attribute value, or nil. EG, nil + # A template for stream parser listeners. + # Note that the declarations (attlistdecl, elementdecl, etc) are trivially + # processed; REXML doesn't yet handle doctype entity declarations, so you + # have to parse them out yourself. + # === Missing methods from SAX2 + # ignorable_whitespace + # === Methods extending SAX2 + # +WARNING+ + # These methods are certainly going to change, until DTDs are fully + # supported. Be aware of this. + # start_document + # end_document + # doctype + # elementdecl + # attlistdecl + # entitydecl + # notationdecl + # cdata + # xmldecl + # comment + module SAX2Listener + def start_document + end + def end_document + end + def start_prefix_mapping prefix, uri + end + def end_prefix_mapping prefix + end + def start_element uri, localname, qname, attributes + end + def end_element uri, localname, qname + end + def characters text + end + def processing_instruction target, data + end + # Handles a doctype declaration. Any attributes of the doctype which are + # not supplied will be nil. # EG, <!DOCTYPE me PUBLIC "foo" "bar"> + # @p name the name of the doctype; EG, "me" + # @p pub_sys "PUBLIC", "SYSTEM", or nil. EG, "PUBLIC" + # @p long_name the supplied long name, or nil. EG, "foo" + # @p uri the uri of the doctype, or nil. EG, "bar" + def doctype name, pub_sys, long_name, uri + end + # If a doctype includes an ATTLIST declaration, it will cause this + # method to be called. The content is the declaration itself, unparsed. + # EG, <!ATTLIST el attr CDATA #REQUIRED> will come to this method as "el + # attr CDATA #REQUIRED". This is the same for all of the .*decl + # methods. + def attlistdecl(element, pairs, contents) + end + # <!ELEMENT ...> + def elementdecl content + end + # <!ENTITY ...> + # The argument passed to this method is an array of the entity + # declaration. It can be in a number of formats, but in general it + # returns (example, result): + # <!ENTITY % YN '"Yes"'> + # ["%", "YN", "\"Yes\""] + # <!ENTITY % YN 'Yes'> + # ["%", "YN", "Yes"] + # <!ENTITY WhatHeSaid "He said %YN;"> + # ["WhatHeSaid", "He said %YN;"] + # <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> + # ["open-hatch", "SYSTEM", "http://www.textuality.com/boilerplate/OpenHatch.xml"] + # <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"> + # ["open-hatch", "PUBLIC", "-//Textuality//TEXT Standard open-hatch boilerplate//EN", "http://www.textuality.com/boilerplate/OpenHatch.xml"] + # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> + # ["hatch-pic", "SYSTEM", "../grafix/OpenHatch.gif", "NDATA", "gif"] + def entitydecl declaration + end + # <!NOTATION ...> + def notationdecl name, public_or_system, public_id, system_id + end + # Called when <![CDATA[ ... ]]> is encountered in a document. + # @p content "..." + def cdata content + end + # Called when an XML PI is encountered in the document. + # EG: <?xml version="1.0" encoding="utf"?> + # @p version the version attribute value. EG, "1.0" + # @p encoding the encoding attribute value, or nil. EG, "utf" + # @p standalone the standalone attribute value, or nil. EG, nil # @p spaced the declaration is followed by a line break - def xmldecl version, encoding, standalone - end - # Called when a comment is encountered. - # @p comment The content of the comment - def comment comment - end + def xmldecl version, encoding, standalone + end + # Called when a comment is encountered. + # @p comment The content of the comment + def comment comment + end def progress position end - end + end end diff --git a/lib/rexml/security.rb b/lib/rexml/security.rb new file mode 100644 index 0000000000..99b7460772 --- /dev/null +++ b/lib/rexml/security.rb @@ -0,0 +1,28 @@ +# frozen_string_literal: false +module REXML + module Security + @@entity_expansion_limit = 10_000 + + # Set the entity expansion limit. By default the limit is set to 10000. + def self.entity_expansion_limit=( val ) + @@entity_expansion_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10000. + def self.entity_expansion_limit + return @@entity_expansion_limit + end + + @@entity_expansion_text_limit = 10_240 + + # Set the entity expansion limit. By default the limit is set to 10240. + def self.entity_expansion_text_limit=( val ) + @@entity_expansion_text_limit = val + end + + # Get the entity expansion limit. By default the limit is set to 10240. + def self.entity_expansion_text_limit + return @@entity_expansion_text_limit + end + end +end diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb index ce7a2c98b0..af65cf4751 100644 --- a/lib/rexml/source.rb +++ b/lib/rexml/source.rb @@ -1,3 +1,5 @@ +# coding: US-ASCII +# frozen_string_literal: false require 'rexml/encoding' module REXML @@ -7,13 +9,14 @@ module REXML # @param arg Either a String, or an IO # @return a Source, or nil if a bad argument was given def SourceFactory::create_from(arg) - if arg.kind_of? String - Source.new(arg) - elsif arg.respond_to? :read and - arg.respond_to? :readline and - arg.respond_to? :nil? and - arg.respond_to? :eof? + if arg.respond_to? :read and + arg.respond_to? :readline and + arg.respond_to? :nil? and + arg.respond_to? :eof? IOSource.new(arg) + elsif arg.respond_to? :to_str + require 'stringio' + IOSource.new(StringIO.new(arg)) elsif arg.kind_of? Source arg else @@ -42,7 +45,7 @@ module REXML if encoding self.encoding = encoding else - self.encoding = check_encoding( @buffer ) + detect_encoding end @line = 0 end @@ -52,22 +55,16 @@ module REXML # Overridden to support optimized en/decoding def encoding=(enc) return unless super - @line_break = encode( '>' ) - if enc != UTF_8 - @buffer = decode(@buffer) - @to_utf = true - else - @to_utf = false - end + encoding_updated end # Scans the source for a given pattern. Note, that this is not your # usual scan() method. For one thing, the pattern argument has some # requirements; for another, the source can be consumed. You can easily # confuse this method. Originally, the patterns were easier - # to construct and this method more robust, because this method - # generated search regexes on the fly; however, this was - # computationally expensive and slowed down the entire REXML package + # to construct and this method more robust, because this method + # generated search regexps on the fly; however, this was + # computationally expensive and slowed down the entire REXML package # considerably, since this is by far the most commonly called method. # @param pattern must be a Regexp, and must be in the form of # /^\s*(#{your pattern, with no groups})(.*)/. The first group @@ -123,6 +120,38 @@ module REXML res = res[-1] if res.kind_of? Array lines.index( res ) if res end + + private + def detect_encoding + buffer_encoding = @buffer.encoding + detected_encoding = "UTF-8" + begin + @buffer.force_encoding("ASCII-8BIT") + if @buffer[0, 2] == "\xfe\xff" + @buffer[0, 2] = "" + detected_encoding = "UTF-16BE" + elsif @buffer[0, 2] == "\xff\xfe" + @buffer[0, 2] = "" + detected_encoding = "UTF-16LE" + elsif @buffer[0, 3] == "\xef\xbb\xbf" + @buffer[0, 3] = "" + detected_encoding = "UTF-8" + end + ensure + @buffer.force_encoding(buffer_encoding) + end + self.encoding = detected_encoding + end + + def encoding_updated + if @encoding != 'UTF-8' + @buffer = decode(@buffer) + @to_utf = true + else + @to_utf = false + @buffer.force_encoding ::Encoding::UTF_8 + end + end end # A Source that wraps an IO. See the Source class for method @@ -134,30 +163,22 @@ module REXML def initialize(arg, block_size=500, encoding=nil) @er_source = @source = arg @to_utf = false + @pending_buffer = nil - # Determining the encoding is a deceptively difficult issue to resolve. - # First, we check the first two bytes for UTF-16. Then we - # assume that the encoding is at least ASCII enough for the '>', and - # we read until we get one of those. This gives us the XML declaration, - # if there is one. If there isn't one, the file MUST be UTF-8, as per - # the XML spec. If there is one, we can determine the encoding from - # it. - @buffer = "" - str = @source.read( 2 ) if encoding - self.encoding = encoding - elsif 0xfe == str[0] && 0xff == str[1] - @line_break = "\000>" - elsif 0xff == str[0] && 0xfe == str[1] - @line_break = ">\000" - elsif 0xef == str[0] && 0xbb == str[1] - str += @source.read(1) - str = '' if (0xbf == str[2]) - @line_break = ">" + super("", encoding) + else + super(@source.read(3) || "") + end + + if !@to_utf and + @buffer.respond_to?(:force_encoding) and + @source.respond_to?(:external_encoding) and + @source.external_encoding != ::Encoding::UTF_8 + @force_utf8 = true else - @line_break = ">" + @force_utf8 = false end - super str+@source.readline( @line_break ) end def scan(pattern, cons=false) @@ -165,16 +186,12 @@ module REXML # You'll notice that this next section is very similar to the same # section in match(), but just a liiittle different. This is # because it is a touch faster to do it this way with scan() - # than the way match() does it; enough faster to warrent duplicating + # than the way match() does it; enough faster to warrant duplicating # some code if rv.size == 0 until @buffer =~ pattern or @source.nil? begin - # READLINE OPT - #str = @source.read(@block_size) - str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str + @buffer << readline rescue Iconv::IllegalSequence raise rescue @@ -189,9 +206,7 @@ module REXML def read begin - str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str + @buffer << readline rescue Exception, NameError @source = nil end @@ -206,9 +221,7 @@ module REXML @buffer = $' if cons and rv while !rv and @source begin - str = @source.readline(@line_break) - str = decode(str) if @to_utf and str - @buffer << str + @buffer << readline rv = pattern.match(@buffer) @buffer = $' if cons and rv rescue @@ -218,13 +231,13 @@ module REXML rv.taint rv end - + def empty? super and ( @source.nil? || @source.eof? ) end def position - @er_source.stat.pipe? ? 0 : @er_source.pos + @er_source.pos rescue 0 end # @return the current line in the source @@ -247,5 +260,38 @@ module REXML end [pos, lineno, line] end + + private + def readline + str = @source.readline(@line_break) + if @pending_buffer + if str.nil? + str = @pending_buffer + else + str = @pending_buffer + str + end + @pending_buffer = nil + end + return nil if str.nil? + + if @to_utf + decode(str) + else + str.force_encoding(::Encoding::UTF_8) if @force_utf8 + str + end + end + + def encoding_updated + case @encoding + when "UTF-16BE", "UTF-16LE" + @source.binmode + @source.set_encoding(@encoding, @encoding) + end + @line_break = encode(">") + @pending_buffer, @buffer = @buffer, "" + @pending_buffer.force_encoding(@encoding) + super + end end end diff --git a/lib/rexml/streamlistener.rb b/lib/rexml/streamlistener.rb index 6f401125b5..30c8945179 100644 --- a/lib/rexml/streamlistener.rb +++ b/lib/rexml/streamlistener.rb @@ -1,92 +1,93 @@ +# frozen_string_literal: false module REXML - # A template for stream parser listeners. - # Note that the declarations (attlistdecl, elementdecl, etc) are trivially - # processed; REXML doesn't yet handle doctype entity declarations, so you - # have to parse them out yourself. - module StreamListener - # Called when a tag is encountered. - # @p name the tag name - # @p attrs an array of arrays of attribute/value pairs, suitable for - # use with assoc or rassoc. IE, <tag attr1="value1" attr2="value2"> - # will result in - # tag_start( "tag", # [["attr1","value1"],["attr2","value2"]]) - def tag_start name, attrs - end - # Called when the end tag is reached. In the case of <tag/>, tag_end - # will be called immidiately after tag_start - # @p the name of the tag - def tag_end name - end - # Called when text is encountered in the document - # @p text the text content. - def text text - end - # Called when an instruction is encountered. EG: <?xsl sheet='foo'?> - # @p name the instruction name; in the example, "xsl" - # @p instruction the rest of the instruction. In the example, - # "sheet='foo'" - def instruction name, instruction - end - # Called when a comment is encountered. - # @p comment The content of the comment - def comment comment - end - # Handles a doctype declaration. Any attributes of the doctype which are - # not supplied will be nil. # EG, <!DOCTYPE me PUBLIC "foo" "bar"> - # @p name the name of the doctype; EG, "me" - # @p pub_sys "PUBLIC", "SYSTEM", or nil. EG, "PUBLIC" - # @p long_name the supplied long name, or nil. EG, "foo" - # @p uri the uri of the doctype, or nil. EG, "bar" - def doctype name, pub_sys, long_name, uri - end - # Called when the doctype is done - def doctype_end - end - # If a doctype includes an ATTLIST declaration, it will cause this - # method to be called. The content is the declaration itself, unparsed. - # EG, <!ATTLIST el attr CDATA #REQUIRED> will come to this method as "el - # attr CDATA #REQUIRED". This is the same for all of the .*decl - # methods. - def attlistdecl element_name, attributes, raw_content - end - # <!ELEMENT ...> - def elementdecl content - end - # <!ENTITY ...> - # The argument passed to this method is an array of the entity - # declaration. It can be in a number of formats, but in general it - # returns (example, result): - # <!ENTITY % YN '"Yes"'> - # ["%", "YN", "'\"Yes\"'", "\""] - # <!ENTITY % YN 'Yes'> - # ["%", "YN", "'Yes'", "s"] - # <!ENTITY WhatHeSaid "He said %YN;"> - # ["WhatHeSaid", "\"He said %YN;\"", "YN"] - # <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> - # ["open-hatch", "SYSTEM", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] - # <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"> - # ["open-hatch", "PUBLIC", "\"-//Textuality//TEXT Standard open-hatch boilerplate//EN\"", "\"http://www.textuality.com/boilerplate/OpenHatch.xml\""] - # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> - # ["hatch-pic", "SYSTEM", "\"../grafix/OpenHatch.gif\"", "\n\t\t\t\t\t\t\tNDATA gif", "gif"] - def entitydecl content - end - # <!NOTATION ...> - def notationdecl content - end - # Called when %foo; is encountered in a doctype declaration. - # @p content "foo" - def entity content - end - # Called when <![CDATA[ ... ]]> is encountered in a document. - # @p content "..." - def cdata content - end - # Called when an XML PI is encountered in the document. - # EG: <?xml version="1.0" encoding="utf"?> - # @p version the version attribute value. EG, "1.0" - # @p encoding the encoding attribute value, or nil. EG, "utf" - # @p standalone the standalone attribute value, or nil. EG, nil - def xmldecl version, encoding, standalone - end - end + # A template for stream parser listeners. + # Note that the declarations (attlistdecl, elementdecl, etc) are trivially + # processed; REXML doesn't yet handle doctype entity declarations, so you + # have to parse them out yourself. + module StreamListener + # Called when a tag is encountered. + # @p name the tag name + # @p attrs an array of arrays of attribute/value pairs, suitable for + # use with assoc or rassoc. IE, <tag attr1="value1" attr2="value2"> + # will result in + # tag_start( "tag", # [["attr1","value1"],["attr2","value2"]]) + def tag_start name, attrs + end + # Called when the end tag is reached. In the case of <tag/>, tag_end + # will be called immediately after tag_start + # @p the name of the tag + def tag_end name + end + # Called when text is encountered in the document + # @p text the text content. + def text text + end + # Called when an instruction is encountered. EG: <?xsl sheet='foo'?> + # @p name the instruction name; in the example, "xsl" + # @p instruction the rest of the instruction. In the example, + # "sheet='foo'" + def instruction name, instruction + end + # Called when a comment is encountered. + # @p comment The content of the comment + def comment comment + end + # Handles a doctype declaration. Any attributes of the doctype which are + # not supplied will be nil. # EG, <!DOCTYPE me PUBLIC "foo" "bar"> + # @p name the name of the doctype; EG, "me" + # @p pub_sys "PUBLIC", "SYSTEM", or nil. EG, "PUBLIC" + # @p long_name the supplied long name, or nil. EG, "foo" + # @p uri the uri of the doctype, or nil. EG, "bar" + def doctype name, pub_sys, long_name, uri + end + # Called when the doctype is done + def doctype_end + end + # If a doctype includes an ATTLIST declaration, it will cause this + # method to be called. The content is the declaration itself, unparsed. + # EG, <!ATTLIST el attr CDATA #REQUIRED> will come to this method as "el + # attr CDATA #REQUIRED". This is the same for all of the .*decl + # methods. + def attlistdecl element_name, attributes, raw_content + end + # <!ELEMENT ...> + def elementdecl content + end + # <!ENTITY ...> + # The argument passed to this method is an array of the entity + # declaration. It can be in a number of formats, but in general it + # returns (example, result): + # <!ENTITY % YN '"Yes"'> + # ["YN", "\"Yes\"", "%"] + # <!ENTITY % YN 'Yes'> + # ["YN", "Yes", "%"] + # <!ENTITY WhatHeSaid "He said %YN;"> + # ["WhatHeSaid", "He said %YN;"] + # <!ENTITY open-hatch SYSTEM "http://www.textuality.com/boilerplate/OpenHatch.xml"> + # ["open-hatch", "SYSTEM", "http://www.textuality.com/boilerplate/OpenHatch.xml"] + # <!ENTITY open-hatch PUBLIC "-//Textuality//TEXT Standard open-hatch boilerplate//EN" "http://www.textuality.com/boilerplate/OpenHatch.xml"> + # ["open-hatch", "PUBLIC", "-//Textuality//TEXT Standard open-hatch boilerplate//EN", "http://www.textuality.com/boilerplate/OpenHatch.xml"] + # <!ENTITY hatch-pic SYSTEM "../grafix/OpenHatch.gif" NDATA gif> + # ["hatch-pic", "SYSTEM", "../grafix/OpenHatch.gif", "gif"] + def entitydecl content + end + # <!NOTATION ...> + def notationdecl content + end + # Called when %foo; is encountered in a doctype declaration. + # @p content "foo" + def entity content + end + # Called when <![CDATA[ ... ]]> is encountered in a document. + # @p content "..." + def cdata content + end + # Called when an XML PI is encountered in the document. + # EG: <?xml version="1.0" encoding="utf"?> + # @p version the version attribute value. EG, "1.0" + # @p encoding the encoding attribute value, or nil. EG, "utf" + # @p standalone the standalone attribute value, or nil. EG, nil + def xmldecl version, encoding, standalone + end + end end diff --git a/lib/rexml/syncenumerator.rb b/lib/rexml/syncenumerator.rb index 955e006cb2..a9d2ad7f9c 100644 --- a/lib/rexml/syncenumerator.rb +++ b/lib/rexml/syncenumerator.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false module REXML class SyncEnumerator include Enumerable @@ -6,8 +7,7 @@ module REXML # Enumerable objects. def initialize(*enums) @gens = enums - @biggest = @gens[0] - @gens.each {|x| @biggest = x if x.size > @biggest.size } + @length = @gens.collect {|x| x.size }.max end # Returns the number of enumerated Enumerable objects, i.e. the size @@ -24,8 +24,8 @@ module REXML # Enumerates rows of the Enumerable objects. def each - @biggest.zip( *@gens ) {|a| - yield(*a[1..-1]) + @length.times {|i| + yield @gens.collect {|x| x[i]} } self end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 2bc00429b3..86269dea1e 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -1,3 +1,5 @@ +# frozen_string_literal: false +require 'rexml/security' require 'rexml/entity' require 'rexml/doctype' require 'rexml/child' @@ -18,25 +20,57 @@ module REXML # If +raw+ is true, then REXML leaves the value alone attr_accessor :raw - ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um - NUMERICENTITY = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um + NUMERICENTITY = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + VALID_CHAR = [ + 0x9, 0xA, 0xD, + (0x20..0xD7FF), + (0xE000..0xFFFD), + (0x10000..0x10FFFF) + ] + + if String.method_defined? :encode + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Integer + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') + else + VALID_XML_CHARS = /^( + [\x09\x0A\x0D\x20-\x7E] # ASCII + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte + | \xEF[\x80-\xBE]{2} # + | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )*$/nx; + end # Constructor # +arg+ if a String, the content is set to the String. If a Text, - # the object is shallowly cloned. + # the object is shallowly cloned. # # +respect_whitespace+ (boolean, false) if true, whitespace is # respected # # +parent+ (nil) if this is a Parent object, the parent - # will be set to this. + # will be set to this. # # +raw+ (nil) This argument can be given three values. - # If true, then the value of used to construct this object is expected to - # contain no unescaped XML markup, and REXML will not change the text. If + # If true, then the value of used to construct this object is expected to + # contain no unescaped XML markup, and REXML will not change the text. If # this value is false, the string may contain any characters, and REXML will # escape any and all defined entities whose values are contained in the - # text. If this value is nil (the default), then the raw value of the + # text. If this value is nil (the default), then the raw value of the # parent will be used as the raw value for this node. If there is no raw # value for the parent, and no value is supplied, the default is false. # Use this field if you have entities defined for some text, and you don't @@ -56,25 +90,24 @@ module REXML # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell" # In the last example, the +entity_filter+ argument is ignored. # - # +pattern+ INTERNAL USE ONLY - def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, - entity_filter=nil, illegal=ILLEGAL ) + # +illegal+ INTERNAL USE ONLY + def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, + entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK ) @raw = false + @parent = nil if parent super( parent ) - @raw = parent.raw - else - @parent = nil + @raw = parent.raw end @raw = raw unless raw.nil? @entity_filter = entity_filter - @normalized = @unnormalized = nil + clear_cache if arg.kind_of? String - @string = arg.clone + @string = arg.dup @string.squeeze!(" \n\t") unless respect_whitespace elsif arg.kind_of? Text @string = arg.to_s @@ -85,10 +118,55 @@ module REXML @string.gsub!( /\r\n?/, "\n" ) - # check for illegal characters - if @raw - if @string =~ illegal - raise "Illegal character '#{$1}' in raw string \"#{@string}\"" + Text.check(@string, illegal, doctype) if @raw + end + + def parent= parent + super(parent) + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + # check for illegal characters + def Text.check string, pattern, doctype + + # illegal anywhere + if string !~ VALID_XML_CHARS + if String.method_defined? :encode + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + else + string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c| + case c.unpack('U') + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + end + end + + # context sensitive + string.scan(pattern) do + if $1[-1] != ?; + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + elsif $1[0] == ?& + if $5 and $5[0] == ?# + case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + end + # FIXME: below can't work but this needs API change. + # elsif @parent and $3 and !SUBSTITUTES.include?($1) + # if !doctype or !doctype.entities.has_key?($3) + # raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" + # end + end end end end @@ -109,8 +187,13 @@ module REXML # Appends text to this text node. The text is appended in the +raw+ mode # of this text node. + # + # +returns+ the text itself to enable method chain like + # 'text << "XXX" << "YYY"'. def <<( to_append ) @string << to_append.gsub( /\r\n?/, "\n" ) + clear_cache + self end @@ -120,17 +203,24 @@ module REXML to_s() <=> other.to_s end + def doctype + if @parent + doc = @parent.document + doc.doctype if doc + end + end + REFERENCE = /#{Entity::REFERENCE}/ # Returns the string value of this text node. This string is always # escaped, meaning that it is a valid XML text node string, and all # entities that can be escaped, have been inserted. This method respects # the entity filter set in the constructor. - # - # # Assume that the entity "s" is defined to be "sean", and that the + # + # # Assume that the entity "s" is defined to be "sean", and that the # # entity "r" is defined to be "russell" - # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) + # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) # t.to_s #-> "< & &s; russell" - # t = Text.new( "< & &s; russell", false, nil, false ) + # t = Text.new( "< & &s; russell", false, nil, false ) # t.to_s #-> "< & &s; russell" # u = Text.new( "sean russell", false, nil, true ) # u.to_s #-> "sean russell" @@ -138,12 +228,6 @@ module REXML return @string if @raw return @normalized if @normalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end - @normalized = Text::normalize( @string, doctype, @entity_filter ) end @@ -156,25 +240,20 @@ module REXML # console. This ignores the 'raw' attribute setting, and any # entity_filter. # - # # Assume that the entity "s" is defined to be "sean", and that the + # # Assume that the entity "s" is defined to be "sean", and that the # # entity "r" is defined to be "russell" - # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) + # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) # t.value #-> "< & sean russell" # t = Text.new( "< & &s; russell", false, nil, false ) # t.value #-> "< & sean russell" # u = Text.new( "sean russell", false, nil, true ) # u.value #-> "sean russell" def value - @unnormalized if @unnormalized - doctype = nil - if @parent - doc = @parent.document - doctype = doc.doctype if doc - end + return @unnormalized if @unnormalized @unnormalized = Text::unnormalize( @string, doctype ) end - # Sets the contents of this text node. This expects the text to be + # Sets the contents of this text node. This expects the text to be # unnormalized. It returns self. # # e = Element.new( "a" ) @@ -183,11 +262,10 @@ module REXML # e[0].value = "<a>" # <a><a></a> def value=( val ) @string = val.gsub( /\r\n?/, "\n" ) - @unnormalized = nil - @normalized = nil + clear_cache @raw = false end - + def wrap(string, width, addnewline=false) # Recursively wrap string at width. return string if string.length <= width @@ -202,7 +280,7 @@ module REXML def indent_text(string, level=1, style="\t", indentfirstline=true) return string if level < 0 new_string = '' - string.each { |line| + string.each_line { |line| indent_string = style * level new_line = (indent_string + line).sub(/[\s]+$/,'') new_string << new_line @@ -210,12 +288,12 @@ module REXML new_string.strip! unless indentfirstline return new_string end - + # == DEPRECATED # See REXML::Formatters # - def write( writer, indent=-1, transitive=false, ie_hack=false ) - Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters") + def write( writer, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters", uplevel: 1) formatter = if indent > -1 REXML::Formatters::Pretty.new( indent ) else @@ -258,6 +336,12 @@ module REXML out << copy end + private + def clear_cache + @normalized = nil + @unnormalized = nil + end + # Reads text, substituting entities def Text::read_with_substitution( input, illegal=nil ) copy = input.clone @@ -265,7 +349,7 @@ module REXML if copy =~ illegal raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" ) end if illegal - + copy.gsub!( /\r\n?/, "\n" ) if copy.include? ?& copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] ) @@ -273,7 +357,7 @@ module REXML copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] ) copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] ) copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] ) - copy.gsub!( /�*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m| + copy.gsub!( /�*((?:\d+)|(?:x[a-f0-9]+));/ ) { m=$1 #m='0' if m=='' m = "0#{m}" if m[0] == ?x @@ -286,16 +370,16 @@ module REXML EREFERENCE = /&(?!#{Entity::NAME};)/ # Escapes all possible entities def Text::normalize( input, doctype=nil, entity_filter=nil ) - copy = input + copy = input.to_s # Doing it like this rather than in a loop improves the speed #copy = copy.gsub( EREFERENCE, '&' ) copy = copy.gsub( "&", "&" ) if doctype # Replace all ampersands that aren't part of an entity doctype.entities.each_value do |entity| - copy = copy.gsub( entity.value, - "&#{entity.name};" ) if entity.value and - not( entity_filter and entity_filter.include?(entity) ) + copy = copy.gsub( entity.value, + "&#{entity.name};" ) if entity.value and + not( entity_filter and entity_filter.include?(entity.name) ) end else # Replace all ampersands that aren't part of an entity @@ -308,37 +392,35 @@ module REXML # Unescapes all possible entities def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil ) - rv = string.clone - rv.gsub!( /\r\n?/, "\n" ) - matches = rv.scan( REFERENCE ) - return rv if matches.size == 0 - rv.gsub!( NUMERICENTITY ) {|m| - m=$1 - m = "0#{m}" if m[0] == ?x - [Integer(m)].pack('U*') + sum = 0 + string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { + s = Text.expand($&, doctype, filter) + if sum + s.bytesize > Security.entity_expansion_text_limit + raise "entity expansion has grown too large" + else + sum += s.bytesize + end + s } - matches.collect!{|x|x[0]}.compact! - if matches.size > 0 - if doctype - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = doctype.entity( entity_reference ) - re = /&#{entity_reference};/ - rv.gsub!( re, entity_value ) if entity_value - end - end + end + + def Text.expand(ref, doctype, filter) + if ref[1] == ?# + if ref[2] == ?x + [ref[3...-1].to_i(16)].pack('U*') else - matches.each do |entity_reference| - unless filter and filter.include?(entity_reference) - entity_value = DocType::DEFAULT_ENTITIES[ entity_reference ] - re = /&#{entity_reference};/ - rv.gsub!( re, entity_value.value ) if entity_value - end - end + [ref[2...-1].to_i].pack('U*') end - rv.gsub!( /&/, '&' ) + elsif ref == '&' + '&' + elsif filter and filter.include?( ref[1...-1] ) + ref + elsif doctype + doctype.entity( ref[1...-1] ) or ref + else + entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ] + entity_value ? entity_value.value : ref end - rv end end end diff --git a/lib/rexml/undefinednamespaceexception.rb b/lib/rexml/undefinednamespaceexception.rb index 8ebfdfd0a9..e522ed57ea 100644 --- a/lib/rexml/undefinednamespaceexception.rb +++ b/lib/rexml/undefinednamespaceexception.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/parseexception' module REXML class UndefinedNamespaceException < ParseException diff --git a/lib/rexml/validation/relaxng.rb b/lib/rexml/validation/relaxng.rb index 969f51bc95..fb52438290 100644 --- a/lib/rexml/validation/relaxng.rb +++ b/lib/rexml/validation/relaxng.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require "rexml/validation/validation" require "rexml/parsers/baseparser" @@ -79,7 +80,7 @@ module REXML when "mixed" states << Interleave.new( self ) states[-2] << states[-1] - states[-1] << TEXT + states[-1] << TEXT when "define" states << [ event[2]["name"] ] when "ref" @@ -102,7 +103,7 @@ module REXML case event[1] when "element", "attribute" states[-1] << event - when "zeroOrMore", "oneOrMore", "choice", "optional", + when "zeroOrMore", "oneOrMore", "choice", "optional", "interleave", "group", "mixed" states.pop when "define" @@ -139,13 +140,12 @@ module REXML @events.each {|s| s.reset if s.kind_of? State } end - def previous=( previous ) + def previous=( previous ) @previous << previous end def next( event ) #print "In next with #{event.inspect}. " - #puts "Next (#@current) is #{@events[@current]}" #p @previous return @previous.pop.next( event ) if @events[@current].nil? expand_ref_in( @events, @current ) if @events[@current].class == Ref @@ -154,19 +154,15 @@ module REXML @events[@current-1].previous = self return @events[@current-1].next( event ) end - #puts "Current isn't a state" if ( @events[@current].matches?(event) ) @current += 1 if @events[@current].nil? - #puts "#{inspect[0,5]} 1RETURNING #{@previous.inspect[0,5]}" return @previous.pop elsif @events[@current].kind_of? State @current += 1 - #puts "#{inspect[0,5]} 2RETURNING (#{@current-1}) #{@events[@current-1].inspect[0,5]}; on return, next is #{@events[@current]}" @events[@current-1].previous = self return @events[@current-1] else - #puts "#{inspect[0,5]} RETURNING self w/ next(#@current) = #{@events[@current]}" return self end else @@ -183,7 +179,7 @@ module REXML end def inspect - "< #{to_s} #{@events.collect{|e| + "< #{to_s} #{@events.collect{|e| pre = e == @events[@current] ? '#' : '' pre + e.inspect unless self == e }.join(', ')} >" @@ -201,15 +197,15 @@ module REXML protected def expand_ref_in( arry, ind ) new_events = [] - @references[ arry[ind].to_s ].each{ |evt| + @references[ arry[ind].to_s ].each{ |evt| add_event_to_arry(new_events,evt) } arry[ind,1] = new_events end - def add_event_to_arry( arry, evt ) + def add_event_to_arry( arry, evt ) evt = generate_event( evt ) - if evt.kind_of? String + if evt.kind_of? String arry[-1].event_arg = evt if arry[-1].kind_of? Event and @value @value = false else @@ -272,7 +268,7 @@ module REXML end def matches?(event) - @events[@current].matches?(event) || + @events[@current].matches?(event) || (@current == 0 and @previous[-1].matches?(event)) end @@ -319,7 +315,7 @@ module REXML end def reset - super + super @ord = 0 end @@ -345,7 +341,7 @@ module REXML end def matches?( event ) - @events[@current].matches?(event) || + @events[@current].matches?(event) || (@current == 0 and @ord > 0 and @previous[-1].matches?(event)) end @@ -393,13 +389,10 @@ module REXML # Remove the references # Find the events end - #puts "In next with #{event.inspect}." - #puts "events is #{@events.inspect}" unless @events @events = [] return nil end - #puts "current = #@current" super end @@ -409,10 +402,8 @@ module REXML end def expected - #puts "IN CHOICE EXPECTED" - #puts "EVENTS = #{@events.inspect}" return [@events[@current]] if @events.size > 0 - return @choices.collect do |x| + return @choices.collect do |x| if x[0].kind_of? State x[0].expected else @@ -426,17 +417,17 @@ module REXML end protected - def add_event_to_arry( arry, evt ) + def add_event_to_arry( arry, evt ) if evt.kind_of? State or evt.class == Ref arry << [evt] - elsif evt[0] == :text + elsif evt[0] == :text if arry[-1] and - arry[-1][-1].kind_of?( Event ) and + arry[-1][-1].kind_of?( Event ) and arry[-1][-1].event_type == :text and @value arry[-1][-1].event_arg = evt[1] @value = false - end + end else arry << [] if evt[0] == :start_element arry[-1] << generate_event( evt ) @@ -478,9 +469,7 @@ module REXML @choices[idx] = old @choice += 1 end - - #puts "In next with #{event.inspect}." - #puts "events is #{@events.inspect}" + @events = [] unless @events end @@ -490,30 +479,23 @@ module REXML next_current(event) unless @events[@current] return nil unless @events[@current] - expand_ref_in( @events, @current ) if @events[@current].class == Ref - #puts "In next with #{event.inspect}." - #puts "Next (#@current) is #{@events[@current]}" + expand_ref_in( @events, @current ) if @events[@current].class == Ref if ( @events[@current].kind_of? State ) @current += 1 @events[@current-1].previous = self return @events[@current-1].next( event ) end - #puts "Current isn't a state" return @previous.pop.next( event ) if @events[@current].nil? if ( @events[@current].matches?(event) ) @current += 1 if @events[@current].nil? - #puts "#{inspect[0,5]} 1RETURNING self" unless @choices[@choice].nil? return self unless @choices[@choice].nil? - #puts "#{inspect[0,5]} 1RETURNING #{@previous[-1].inspect[0,5]}" return @previous.pop elsif @events[@current].kind_of? State @current += 1 - #puts "#{inspect[0,5]} 2RETURNING (#{@current-1}) #{@events[@current-1].inspect[0,5]}; on return, next is #{@events[@current]}" @events[@current-1].previous = self return @events[@current-1] else - #puts "#{inspect[0,5]} RETURNING self w/ next(#@current) = #{@events[@current]}" return self end else @@ -527,10 +509,8 @@ module REXML end def expected - #puts "IN CHOICE EXPECTED" - #puts "EVENTS = #{@events.inspect}" return [@events[@current]] if @events[@current] - return @choices[@choice..-1].collect do |x| + return @choices[@choice..-1].collect do |x| if x[0].kind_of? State x[0].expected else diff --git a/lib/rexml/validation/validation.rb b/lib/rexml/validation/validation.rb index 160ea96b31..f0c76f976c 100644 --- a/lib/rexml/validation/validation.rb +++ b/lib/rexml/validation/validation.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/validation/validationexception' module REXML @@ -14,9 +15,7 @@ module REXML def dump puts @root.inspect end - def validate( event ) - #puts "Current: #@current" - #puts "Event: #{event.inspect}" + def validate( event ) @attr_stack = [] unless defined? @attr_stack match = @current.next(event) raise ValidationException.new( "Validation error. Expected: "+ @@ -27,30 +26,21 @@ module REXML # Check for attributes case event[0] when :start_element - #puts "Checking attributes" @attr_stack << event[2] begin sattr = [:start_attribute, nil] eattr = [:end_attribute] text = [:text, nil] - k,v = event[2].find { |k,v| - sattr[1] = k - #puts "Looking for #{sattr.inspect}" + k, = event[2].find { |key,value| + sattr[1] = key m = @current.next( sattr ) - #puts "Got #{m.inspect}" - if m + if m # If the state has text children... - #puts "Looking for #{eattr.inspect}" - #puts "Expect #{m.expected}" if m.matches?( eattr ) - #puts "Got end" @current = m else - #puts "Didn't get end" - text[1] = v - #puts "Looking for #{text.inspect}" + text[1] = value m = m.next( text ) - #puts "Got #{m.inspect}" text[1] = nil return false unless m @current = m if m @@ -94,7 +84,6 @@ module REXML end def matches?( event ) - #puts "#@event_type =? #{event[0]} && #@event_arg =? #{event[1]} " return false unless event[0] == @event_type case event[0] when nil diff --git a/lib/rexml/validation/validationexception.rb b/lib/rexml/validation/validationexception.rb index 4723d9e4d3..78cd63fd04 100644 --- a/lib/rexml/validation/validationexception.rb +++ b/lib/rexml/validation/validationexception.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false module REXML module Validation class ValidationException < RuntimeError diff --git a/lib/rexml/xmldecl.rb b/lib/rexml/xmldecl.rb index 427eb78cf8..a37e9f3ddc 100644 --- a/lib/rexml/xmldecl.rb +++ b/lib/rexml/xmldecl.rb @@ -1,41 +1,42 @@ +# frozen_string_literal: false require 'rexml/encoding' require 'rexml/source' module REXML - # NEEDS DOCUMENTATION - class XMLDecl < Child - include Encoding + # NEEDS DOCUMENTATION + class XMLDecl < Child + include Encoding - DEFAULT_VERSION = "1.0"; - DEFAULT_ENCODING = "UTF-8"; - DEFAULT_STANDALONE = "no"; - START = '<\?xml'; - STOP = '\?>'; + DEFAULT_VERSION = "1.0"; + DEFAULT_ENCODING = "UTF-8"; + DEFAULT_STANDALONE = "no"; + START = '<\?xml'; + STOP = '\?>'; - attr_accessor :version, :standalone + attr_accessor :version, :standalone attr_reader :writeencoding, :writethis - def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) + def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) @writethis = true @writeencoding = !encoding.nil? - if version.kind_of? XMLDecl - super() - @version = version.version - self.encoding = version.encoding + if version.kind_of? XMLDecl + super() + @version = version.version + self.encoding = version.encoding @writeencoding = version.writeencoding - @standalone = version.standalone - else - super() - @version = version - self.encoding = encoding - @standalone = standalone - end - @version = DEFAULT_VERSION if @version.nil? - end - - def clone - XMLDecl.new(self) - end + @standalone = version.standalone + else + super() + @version = version + self.encoding = encoding + @standalone = standalone + end + @version = DEFAULT_VERSION if @version.nil? + end + + def clone + XMLDecl.new(self) + end # indent:: # Ignored. There must be no whitespace before an XML declaration @@ -43,35 +44,31 @@ module REXML # Ignored # ie_hack:: # Ignored - def write(writer, indent=-1, transitive=false, ie_hack=false) + def write(writer, indent=-1, transitive=false, ie_hack=false) return nil unless @writethis or writer.kind_of? Output - writer << START.sub(/\\/u, '') - if writer.kind_of? Output - writer << " #{content writer.encoding}" - else - writer << " #{content encoding}" - end - writer << STOP.sub(/\\/u, '') - end - - def ==( other ) - other.kind_of?(XMLDecl) and - other.version == @version and - other.encoding == self.encoding and - other.standalone == @standalone - end - - def xmldecl version, encoding, standalone - @version = version - self.encoding = encoding - @standalone = standalone - end - - def node_type - :xmldecl - end - - alias :stand_alone? :standalone + writer << START.sub(/\\/u, '') + writer << " #{content encoding}" + writer << STOP.sub(/\\/u, '') + end + + def ==( other ) + other.kind_of?(XMLDecl) and + other.version == @version and + other.encoding == self.encoding and + other.standalone == @standalone + end + + def xmldecl version, encoding, standalone + @version = version + self.encoding = encoding + @standalone = standalone + end + + def node_type + :xmldecl + end + + alias :stand_alone? :standalone alias :old_enc= :encoding= def encoding=( enc ) @@ -108,12 +105,12 @@ module REXML START.sub(/\\/u, '') + " ... " + STOP.sub(/\\/u, '') end - private - def content(enc) - rv = "version='#@version'" - rv << " encoding='#{enc}'" if @writeencoding || enc !~ /utf-8/i - rv << " standalone='#@standalone'" if @standalone - rv - end - end + private + def content(enc) + rv = "version='#@version'" + rv << " encoding='#{enc}'" if @writeencoding || enc !~ /\Autf-8\z/i + rv << " standalone='#@standalone'" if @standalone + rv + end + end end diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb index 6bbe5b07d5..392b47b1d3 100644 --- a/lib/rexml/xmltokens.rb +++ b/lib/rexml/xmltokens.rb @@ -1,18 +1,85 @@ +# frozen_string_literal: false module REXML - # Defines a number of tokens used for parsing XML. Not for general - # consumption. - module XMLTokens - NCNAME_STR= '[\w:][\-\w\d.]*' - NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + # Defines a number of tokens used for parsing XML. Not for general + # consumption. + module XMLTokens + # From http://www.w3.org/TR/REC-xml/#sec-common-syn + # + # [4] NameStartChar ::= + # ":" | + # [A-Z] | + # "_" | + # [a-z] | + # [#xC0-#xD6] | + # [#xD8-#xF6] | + # [#xF8-#x2FF] | + # [#x370-#x37D] | + # [#x37F-#x1FFF] | + # [#x200C-#x200D] | + # [#x2070-#x218F] | + # [#x2C00-#x2FEF] | + # [#x3001-#xD7FF] | + # [#xF900-#xFDCF] | + # [#xFDF0-#xFFFD] | + # [#x10000-#xEFFFF] + name_start_chars = [ + ":", + "A-Z", + "_", + "a-z", + "\\u00C0-\\u00D6", + "\\u00D8-\\u00F6", + "\\u00F8-\\u02FF", + "\\u0370-\\u037D", + "\\u037F-\\u1FFF", + "\\u200C-\\u200D", + "\\u2070-\\u218F", + "\\u2C00-\\u2FEF", + "\\u3001-\\uD7FF", + "\\uF900-\\uFDCF", + "\\uFDF0-\\uFFFD", + "\\u{10000}-\\u{EFFFF}", + ] + # From http://www.w3.org/TR/REC-xml/#sec-common-syn + # + # [4a] NameChar ::= + # NameStartChar | + # "-" | + # "." | + # [0-9] | + # #xB7 | + # [#x0300-#x036F] | + # [#x203F-#x2040] + name_chars = name_start_chars + [ + "\\-", + "\\.", + "0-9", + "\\u00B7", + "\\u0300-\\u036F", + "\\u203F-\\u2040", + ] + NAME_START_CHAR = "[#{name_start_chars.join('')}]" + NAME_CHAR = "[#{name_chars.join('')}]" + NAMECHAR = NAME_CHAR # deprecated. Use NAME_CHAR instead. - NAMECHAR = '[\-\w\d\.:]' - NAME = "([\\w:]#{NAMECHAR}*)" - NMTOKEN = "(?:#{NAMECHAR})+" - NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" - REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + # From http://www.w3.org/TR/xml-names11/#NT-NCName + # + # [6] NCNameStartChar ::= NameStartChar - ':' + ncname_start_chars = name_start_chars - [":"] + # From http://www.w3.org/TR/xml-names11/#NT-NCName + # + # [5] NCNameChar ::= NameChar - ':' + ncname_chars = name_chars - [":"] + NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*" + NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" - #REFERENCE = "(?:#{ENTITYREF}|#{CHARREF})" - #ENTITYREF = "&#{NAME};" - #CHARREF = "&#\\d+;|&#x[0-9a-fA-F]+;" - end + NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)" + NMTOKEN = "(?:#{NAME_CHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + + #REFERENCE = "(?:#{ENTITYREF}|#{CHARREF})" + #ENTITYREF = "&#{NAME};" + #CHARREF = "&#\\d+;|&#x[0-9a-fA-F]+;" + end end diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb index e8813efd3e..f1cb99baea 100644 --- a/lib/rexml/xpath.rb +++ b/lib/rexml/xpath.rb @@ -1,66 +1,81 @@ +# frozen_string_literal: false require 'rexml/functions' require 'rexml/xpath_parser' module REXML - # Wrapper class. Use this class to access the XPath functions. - class XPath - include Functions - EMPTY_HASH = {} + # Wrapper class. Use this class to access the XPath functions. + class XPath + include Functions + # A base Hash object, supposing to be used when initializing a + # default empty namespaces set, but is currently unused. + # TODO: either set the namespaces=EMPTY_HASH, or deprecate this. + EMPTY_HASH = {} - # Finds and returns the first node that matches the supplied xpath. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, returns the first - # node matching '*'. - # namespaces:: - # If supplied, a Hash which defines a namespace mapping. - # - # XPath.first( node ) - # XPath.first( doc, "//b"} ) - # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + # Finds and returns the first node that matches the supplied xpath. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, returns the first + # node matching '*'. + # namespaces:: + # If supplied, a Hash which defines a namespace mapping. + # variables:: + # If supplied, a Hash which maps $variables in the query + # to values. This can be used to avoid XPath injection attacks + # or to automatically handle escaping string values. + # + # XPath.first( node ) + # XPath.first( doc, "//b"} ) + # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + # XPath.first( node, '/book/publisher/text()=$publisher', {}, {"publisher"=>"O'Reilly"}) def XPath::first element, path=nil, namespaces=nil, variables={} raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).flatten[0] - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).flatten[0] + end - # Iterates over nodes that match the given path, calling the supplied - # block with the match. - # element:: - # The context element - # path:: - # The xpath to search for. If not supplied or nil, defaults to '*' - # namespaces:: - # If supplied, a Hash which defines a namespace mapping - # - # XPath.each( node ) { |el| ... } - # XPath.each( node, '/*[@attr='v']' ) { |el| ... } - # XPath.each( node, 'ancestor::x' ) { |el| ... } - def XPath::each element, path=nil, namespaces=nil, variables={}, &block + # Iterates over nodes that match the given path, calling the supplied + # block with the match. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, defaults to '*' + # namespaces:: + # If supplied, a Hash which defines a namespace mapping + # variables:: + # If supplied, a Hash which maps $variables in the query + # to values. This can be used to avoid XPath injection attacks + # or to automatically handle escaping string values. + # + # XPath.each( node ) { |el| ... } + # XPath.each( node, '/*[@attr='v']' ) { |el| ... } + # XPath.each( node, 'ancestor::x' ) { |el| ... } + # XPath.each( node, '/book/publisher/text()=$publisher', {}, {"publisher"=>"O'Reilly"}) \ + # {|el| ... } + def XPath::each element, path=nil, namespaces=nil, variables={}, &block raise "The namespaces argument, if supplied, must be a hash object." unless namespaces.nil? or namespaces.kind_of?(Hash) raise "The variables argument, if supplied, must be a hash object." unless variables.kind_of?(Hash) - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path, element).each( &block ) - end + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).each( &block ) + end - # Returns an array of nodes matching a given XPath. - def XPath::match element, path=nil, namespaces=nil, variables={} - parser = XPathParser.new - parser.namespaces = namespaces - parser.variables = variables - path = "*" unless path - element = [element] unless element.kind_of? Array - parser.parse(path,element) - end - end + # Returns an array of nodes matching a given XPath. + def XPath::match element, path=nil, namespaces=nil, variables={} + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path,element) + end + end end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb index eb608fdb34..181b2b6e85 100644 --- a/lib/rexml/xpath_parser.rb +++ b/lib/rexml/xpath_parser.rb @@ -1,3 +1,4 @@ +# frozen_string_literal: false require 'rexml/namespace' require 'rexml/xmltokens' require 'rexml/attribute' @@ -5,20 +6,30 @@ require 'rexml/syncenumerator' require 'rexml/parsers/xpathparser' class Object + # provides a unified +clone+ operation, for REXML::XPathParser + # to use across multiple Object types def dclone clone end end class Symbol + # provides a unified +clone+ operation, for REXML::XPathParser + # to use across multiple Object types def dclone ; self ; end end -class Fixnum +class Integer + # provides a unified +clone+ operation, for REXML::XPathParser + # to use across multiple Object types def dclone ; self ; end end class Float + # provides a unified +clone+ operation, for REXML::XPathParser + # to use across multiple Object types def dclone ; self ; end end class Array + # provides a unified +clone+ operation, for REXML::XPathParser + # to use across multiple Object+ types def dclone klone = self.clone klone.clear @@ -53,19 +64,13 @@ module REXML end def parse path, nodeset - #puts "#"*40 - path_stack = @parser.parse( path ) - #puts "PARSE: #{path} => #{path_stack.inspect}" - #puts "PARSE: nodeset = #{nodeset.inspect}" - match( path_stack, nodeset ) + path_stack = @parser.parse( path ) + match( path_stack, nodeset ) end def get_first path, nodeset - #puts "#"*40 - path_stack = @parser.parse( path ) - #puts "PARSE: #{path} => #{path_stack.inspect}" - #puts "PARSE: nodeset = #{nodeset.inspect}" - first( path_stack, nodeset ) + path_stack = @parser.parse( path ) + first( path_stack, nodeset ) end def predicate path, nodeset @@ -83,25 +88,20 @@ module REXML # # FIXME: This method is incomplete! def first( path_stack, node ) - #puts "#{depth}) Entering match( #{path.inspect}, #{tree.inspect} )" return nil if path.size == 0 case path[0] when :document - # do nothing + # do nothing return first( path[1..-1], node ) when :child for c in node.children - #puts "#{depth}) CHILD checking #{name(c)}" r = first( path[1..-1], c ) - #puts "#{depth}) RETURNING #{r.inspect}" if r return r if r end when :qname name = path[2] - #puts "#{depth}) QNAME #{name(tree)} == #{name} (path => #{path.size})" if node.name == name - #puts "#{depth}) RETURNING #{tree.inspect}" if path.size == 3 return node if path.size == 3 return first( path[3..-1], node ) else @@ -123,11 +123,8 @@ module REXML end - def match( path_stack, nodeset ) - #puts "MATCH: path_stack = #{path_stack.inspect}" - #puts "MATCH: nodeset = #{nodeset.inspect}" + def match( path_stack, nodeset ) r = expr( path_stack, nodeset ) - #puts "MAIN EXPR => #{r.inspect}" r end @@ -136,7 +133,7 @@ module REXML # Returns a String namespace for a node, given a prefix # The rules are: - # + # # 1. Use the supplied namespace mapping first. # 2. If no mapping was supplied, use the context node to look up the namespace def get_namespace( node, prefix ) @@ -154,15 +151,9 @@ module REXML ALL = [ :attribute, :element, :text, :processing_instruction, :comment ] ELEMENTS = [ :element ] def expr( path_stack, nodeset, context=nil ) - #puts "#"*15 - #puts "In expr with #{path_stack.inspect}" - #puts "Returning" if path_stack.length == 0 || nodeset.length == 0 node_types = ELEMENTS return nodeset if path_stack.length == 0 || nodeset.length == 0 while path_stack.length > 0 - #puts "#"*5 - #puts "Path stack = #{path_stack.inspect}" - #puts "Nodeset is #{nodeset.inspect}" if nodeset.length == 0 path_stack.clear return [] @@ -170,34 +161,25 @@ module REXML case (op = path_stack.shift) when :document nodeset = [ nodeset[0].root_node ] - #puts ":document, nodeset = #{nodeset.inspect}" when :qname - #puts "IN QNAME" prefix = path_stack.shift name = path_stack.shift nodeset.delete_if do |node| # FIXME: This DOUBLES the time XPath searches take ns = get_namespace( node, prefix ) - #puts "NS = #{ns.inspect}" - #puts "node.node_type == :element => #{node.node_type == :element}" if node.node_type == :element - #puts "node.name == #{name} => #{node.name == name}" if node.name == name - #puts "node.namespace == #{ns.inspect} => #{node.namespace == ns}" end end - !(node.node_type == :element and - node.name == name and + !(node.node_type == :element and + node.name == name and node.namespace == ns ) end node_types = ELEMENTS when :any - #puts "ANY 1: nodeset = #{nodeset.inspect}" - #puts "ANY 1: node_types = #{node_types.inspect}" nodeset.delete_if { |node| !node_types.include?(node.node_type) } - #puts "ANY 2: nodeset = #{nodeset.inspect}" when :self # This space left intentionally blank @@ -205,7 +187,7 @@ module REXML when :processing_instruction target = path_stack.shift nodeset.delete_if do |node| - (node.node_type != :processing_instruction) or + (node.node_type != :processing_instruction) or ( target!='' and ( node.target != target ) ) end @@ -222,7 +204,7 @@ module REXML when :child new_nodeset = [] nt = nil - for node in nodeset + nodeset.each do |node| nt = node.node_type new_nodeset += node.children if nt == :element or nt == :document end @@ -231,7 +213,7 @@ module REXML when :literal return path_stack.shift - + when :attribute new_nodeset = [] case path_stack.shift @@ -240,15 +222,11 @@ module REXML name = path_stack.shift for element in nodeset if element.node_type == :element - #puts "Element name = #{element.name}" - #puts "get_namespace( #{element.inspect}, #{prefix} ) = #{get_namespace(element, prefix)}" attrib = element.attribute( name, get_namespace(element, prefix) ) - #puts "attrib = #{attrib.inspect}" new_nodeset << attrib if attrib end end when :any - #puts "ANY" for element in nodeset if element.node_type == :element new_nodeset += element.attributes.to_a @@ -258,15 +236,13 @@ module REXML nodeset = new_nodeset when :parent - #puts "PARENT 1: nodeset = #{nodeset}" nodeset = nodeset.collect{|n| n.parent}.compact #nodeset = expr(path_stack.dclone, nodeset.collect{|n| n.parent}.compact) - #puts "PARENT 2: nodeset = #{nodeset.inspect}" node_types = ELEMENTS when :ancestor new_nodeset = [] - for node in nodeset + nodeset.each do |node| while node.parent node = node.parent new_nodeset << node unless new_nodeset.include? node @@ -277,7 +253,7 @@ module REXML when :ancestor_or_self new_nodeset = [] - for node in nodeset + nodeset.each do |node| if node.node_type == :element new_nodeset << node while ( node.parent ) @@ -295,41 +271,30 @@ module REXML pred = path_stack.shift nodeset.each_with_index { |node, index| subcontext[ :node ] = node - #puts "PREDICATE SETTING CONTEXT INDEX TO #{index+1}" subcontext[ :index ] = index+1 pc = pred.dclone - #puts "#{node.hash}) Recursing with #{pred.inspect} and [#{node.inspect}]" result = expr( pc, [node], subcontext ) result = result[0] if result.kind_of? Array and result.length == 1 - #puts "#{node.hash}) Result = #{result.inspect} (#{result.class.name})" if result.kind_of? Numeric - #puts "Adding node #{node.inspect}" if result == (index+1) new_nodeset << node if result == (index+1) elsif result.instance_of? Array if result.size > 0 and result.inject(false) {|k,s| s or k} - #puts "Adding node #{node.inspect}" if result.size > 0 new_nodeset << node if result.size > 0 end else - #puts "Adding node #{node.inspect}" if result new_nodeset << node if result end } - #puts "New nodeset = #{new_nodeset.inspect}" - #puts "Path_stack = #{path_stack.inspect}" nodeset = new_nodeset =begin predicate = path_stack.shift ns = nodeset.clone result = expr( predicate, ns ) - #puts "Result = #{result.inspect} (#{result.class.name})" - #puts "nodeset = #{nodeset.inspect}" if result.kind_of? Array nodeset = result.zip(ns).collect{|m,n| n if m}.compact else nodeset = result ? nodeset : [] end - #puts "Outgoing NS = #{nodeset.inspect}" =end when :descendant_or_self @@ -341,7 +306,7 @@ module REXML when :descendant results = [] nt = nil - for node in nodeset + nodeset.each do |node| nt = node.node_type results += expr( path_stack.dclone.unshift( :descendant_or_self ), node.children ) if nt == :element or nt == :document @@ -350,7 +315,6 @@ module REXML node_types = ELEMENTS when :following_sibling - #puts "FOLLOWING_SIBLING 1: nodeset = #{nodeset}" results = [] nodeset.each do |node| next if node.parent.nil? @@ -359,7 +323,6 @@ module REXML following_siblings = all_siblings[ current_index+1 .. -1 ] results += expr( path_stack.dclone, following_siblings ) end - #puts "FOLLOWING_SIBLING 2: nodeset = #{nodeset}" nodeset = results when :preceding_sibling @@ -376,26 +339,24 @@ module REXML when :preceding new_nodeset = [] - for node in nodeset + nodeset.each do |node| new_nodeset += preceding( node ) end - #puts "NEW NODESET => #{new_nodeset.inspect}" nodeset = new_nodeset node_types = ELEMENTS when :following new_nodeset = [] - for node in nodeset + nodeset.each do |node| new_nodeset += following( node ) end nodeset = new_nodeset node_types = ELEMENTS when :namespace - #puts "In :namespace" new_nodeset = [] prefix = path_stack.shift - for node in nodeset + nodeset.each do |node| if (node.node_type == :element or node.node_type == :attribute) if @namespaces namespaces = @namespaces @@ -404,9 +365,6 @@ module REXML else namespaces = node.element.namesapces end - #puts "Namespaces = #{namespaces.inspect}" - #puts "Prefix = #{prefix.inspect}" - #puts "Node.namespace = #{node.namespace}" if (node.namespace == namespaces[prefix]) new_nodeset << node end @@ -419,28 +377,23 @@ module REXML return @variables[ var_name ] # :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq - # TODO: Special case for :or and :and -- not evaluate the right - # operand if the left alone determines result (i.e. is true for - # :or and false for :and). - when :eq, :neq, :lt, :lteq, :gt, :gteq, :and, :or + # TODO: Special case for :or and :and -- not evaluate the right + # operand if the left alone determines result (i.e. is true for + # :or and false for :and). + when :eq, :neq, :lt, :lteq, :gt, :gteq, :or left = expr( path_stack.shift, nodeset.dup, context ) - #puts "LEFT => #{left.inspect} (#{left.class.name})" right = expr( path_stack.shift, nodeset.dup, context ) - #puts "RIGHT => #{right.inspect} (#{right.class.name})" res = equality_relational_compare( left, op, right ) - #puts "RES => #{res.inspect}" return res when :and left = expr( path_stack.shift, nodeset.dup, context ) - #puts "LEFT => #{left.inspect} (#{left.class.name})" - if left == false || left.nil? || !left.inject(false) {|a,b| a | b} + return [] unless left + if left.respond_to?(:inject) and !left.inject(false) {|a,b| a | b} return [] end right = expr( path_stack.shift, nodeset.dup, context ) - #puts "RIGHT => #{right.inspect} (#{right.class.name})" res = equality_relational_compare( left, op, right ) - #puts "RES => #{res.inspect}" return res when :div @@ -481,32 +434,27 @@ module REXML when :function func_name = path_stack.shift.tr('-','_') arguments = path_stack.shift - #puts "FUNCTION 0: #{func_name}(#{arguments.collect{|a|a.inspect}.join(', ')})" subcontext = context ? nil : { :size => nodeset.size } res = [] cont = context - nodeset.each_with_index { |n, i| + nodeset.each_with_index { |n, i| if subcontext subcontext[:node] = n subcontext[:index] = i cont = subcontext end arg_clone = arguments.dclone - args = arg_clone.collect { |arg| - #puts "FUNCTION 1: Calling expr( #{arg.inspect}, [#{n.inspect}] )" - expr( arg, [n], cont ) + args = arg_clone.collect { |arg| + expr( arg, [n], cont ) } - #puts "FUNCTION 2: #{func_name}(#{args.collect{|a|a.inspect}.join(', ')})" Functions.context = cont res << Functions.send( func_name, *args ) - #puts "FUNCTION 3: #{res[-1].inspect}" } return res end end # while - #puts "EXPR returning #{nodeset.inspect}" return nodeset end @@ -515,27 +463,21 @@ module REXML # FIXME # The next two methods are BAD MOJO! # This is my achilles heel. If anybody thinks of a better - # way of doing this, be my guest. This really sucks, but + # way of doing this, be my guest. This really sucks, but # it is a wonder it works at all. # ######################################################## - + def descendant_or_self( path_stack, nodeset ) rs = [] - #puts "#"*80 - #puts "PATH_STACK = #{path_stack.inspect}" - #puts "NODESET = #{nodeset.collect{|n|n.inspect}.inspect}" d_o_s( path_stack, nodeset, rs ) - #puts "RS = #{rs.collect{|n|n.inspect}.inspect}" document_order(rs.flatten.compact) #rs.flatten.compact end def d_o_s( p, ns, r ) - #puts "IN DOS with #{ns.inspect}; ALREADY HAVE #{r.inspect}" nt = nil ns.each_index do |i| n = ns[i] - #puts "P => #{p.inspect}" x = expr( p.dclone, [ n ] ) nt = n.node_type d_o_s( p, n.children, x ) if nt == :element or nt == :document and n.children.size > 0 @@ -547,7 +489,7 @@ module REXML # Reorders an array of nodes so that they are in document order # It tries to do this efficiently. # - # FIXME: I need to get rid of this, but the issue is that most of the XPath + # FIXME: I need to get rid of this, but the issue is that most of the XPath # interpreter functions as a filter, which means that we lose context going # in and out of function calls. If I knew what the index of the nodes was, # I wouldn't have to do this. Maybe add a document IDX for each node? @@ -555,7 +497,7 @@ module REXML def document_order( array_of_nodes ) new_arry = [] array_of_nodes.each { |node| - node_idx = [] + node_idx = [] np = node.node_type == :attribute ? node.element : node while np.parent and np.parent.node_type == :element node_idx << np.parent.index( np ) @@ -563,7 +505,6 @@ module REXML end new_arry << [ node_idx.reverse, node ] } - #puts "new_arry = #{new_arry.inspect}" new_arry.sort{ |s1, s2| s1[0] <=> s2[0] }.collect{ |s| s[1] } end @@ -579,10 +520,9 @@ module REXML # Builds a nodeset of all of the preceding nodes of the supplied node, # in reverse document order - # preceding:: includes every element in the document that precedes this node, + # preceding:: includes every element in the document that precedes this node, # except for ancestors def preceding( node ) - #puts "IN PRECEDING" ancestors = [] p = node.parent while p @@ -592,7 +532,6 @@ module REXML acc = [] p = preceding_node_of( node ) - #puts "P = #{p.inspect}" while p if ancestors.include? p ancestors.delete(p) @@ -600,18 +539,14 @@ module REXML acc << p end p = preceding_node_of( p ) - #puts "P = #{p.inspect}" end acc end def preceding_node_of( node ) - #puts "NODE: #{node.inspect}" - #puts "PREVIOUS NODE: #{node.previous_sibling_node.inspect}" - #puts "PARENT NODE: #{node.parent}" - psn = node.previous_sibling_node + psn = node.previous_sibling_node if psn.nil? - if node.parent.nil? or node.parent.class == Document + if node.parent.nil? or node.parent.class == Document return nil end return node.parent @@ -624,22 +559,16 @@ module REXML end def following( node ) - #puts "IN PRECEDING" acc = [] p = next_sibling_node( node ) - #puts "P = #{p.inspect}" while p acc << p p = following_node_of( p ) - #puts "P = #{p.inspect}" end acc end def following_node_of( node ) - #puts "NODE: #{node.inspect}" - #puts "PREVIOUS NODE: #{node.previous_sibling_node.inspect}" - #puts "PARENT NODE: #{node.parent}" if node.kind_of? Element and node.children.size > 0 return node.children[0] end @@ -647,14 +576,13 @@ module REXML end def next_sibling_node(node) - psn = node.next_sibling_node + psn = node.next_sibling_node while psn.nil? - if node.parent.nil? or node.parent.class == Document + if node.parent.nil? or node.parent.class == Document return nil end node = node.parent psn = node.next_sibling_node - #puts "psn = #{psn.inspect}" end return psn end @@ -673,22 +601,17 @@ module REXML end def equality_relational_compare( set1, op, set2 ) - #puts "EQ_REL_COMP(#{set1.inspect} #{op.inspect} #{set2.inspect})" if set1.kind_of? Array and set2.kind_of? Array - #puts "#{set1.size} & #{set2.size}" if set1.size == 1 and set2.size == 1 set1 = set1[0] set2 = set2[0] elsif set1.size == 0 or set2.size == 0 nd = set1.size==0 ? set2 : set1 rv = nd.collect { |il| compare( il, op, nil ) } - #puts "RV = #{rv.inspect}" return rv else res = [] - enum = SyncEnumerator.new( set1, set2 ).each { |i1, i2| - #puts "i1 = #{i1.inspect} (#{i1.class.name})" - #puts "i2 = #{i2.inspect} (#{i2.class.name})" + SyncEnumerator.new( set1, set2 ).each { |i1, i2| i1 = norm( i1 ) i2 = norm( i2 ) res << compare( i1, op, i2 ) @@ -696,8 +619,6 @@ module REXML return res end end - #puts "EQ_REL_COMP: #{set1.inspect} (#{set1.class.name}), #{op}, #{set2.inspect} (#{set2.class.name})" - #puts "COMPARING VALUES" # If one is nodeset and other is number, compare number to each item # in nodeset s.t. number op number(string(item)) # If one is nodeset and other is string, compare string to each item @@ -705,7 +626,6 @@ module REXML # If one is nodeset and other is boolean, compare boolean to each item # in nodeset s.t. boolean op boolean(item) if set1.kind_of? Array or set2.kind_of? Array - #puts "ISA ARRAY" if set1.kind_of? Array a = set1 b = set2 @@ -721,10 +641,8 @@ module REXML return a.collect {|v| compare( Functions::number(v), op, b )} when /^\d+(\.\d+)?$/ b = Functions::number( b ) - #puts "B = #{b.inspect}" return a.collect {|v| compare( Functions::number(v), op, b )} else - #puts "Functions::string( #{b}(#{b.class.name}) ) = #{Functions::string(b)}" b = Functions::string( b ) return a.collect { |v| compare( Functions::string(v), op, b ) } end @@ -738,10 +656,7 @@ module REXML # Convert both to numbers and compare s1 = set1.to_s s2 = set2.to_s - #puts "EQ_REL_COMP: #{set1}=>#{s1}, #{set2}=>#{s2}" if s1 == 'true' or s1 == 'false' or s2 == 'true' or s2 == 'false' - #puts "Functions::boolean(#{set1})=>#{Functions::boolean(set1)}" - #puts "Functions::boolean(#{set2})=>#{Functions::boolean(set2)}" set1 = Functions::boolean( set1 ) set2 = Functions::boolean( set2 ) else @@ -758,15 +673,12 @@ module REXML set2 = Functions::number( set2 ) end end - #puts "EQ_REL_COMP: #{set1} #{op} #{set2}" - #puts ">>> #{compare( set1, op, set2 )}" return compare( set1, op, set2 ) end return false end def compare a, op, b - #puts "COMPARE #{a.inspect}(#{a.class.name}) #{op} #{b.inspect}(#{b.class.name})" case op when :eq a == b |
