From ea7a527a2ae7024a5cf2885dee8f7a5c21fedd5d Mon Sep 17 00:00:00 2001 From: ser Date: Tue, 10 Jun 2003 01:31:01 +0000 Subject: Initial revision git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@3925 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- lib/rexml/attlistdecl.rb | 62 ++ lib/rexml/attribute.rb | 151 +++++ lib/rexml/cdata.rb | 68 ++ lib/rexml/child.rb | 96 +++ lib/rexml/comment.rb | 79 +++ lib/rexml/doctype.rb | 182 +++++ lib/rexml/document.rb | 237 +++++++ lib/rexml/dtd/attlistdecl.rb | 10 + lib/rexml/dtd/dtd.rb | 51 ++ lib/rexml/dtd/elementdecl.rb | 17 + lib/rexml/dtd/entitydecl.rb | 56 ++ lib/rexml/dtd/notationdecl.rb | 39 ++ lib/rexml/element.rb | 1147 ++++++++++++++++++++++++++++++++ lib/rexml/encoding.rb | 62 ++ lib/rexml/encodings/EUC-JP.rb | 17 + lib/rexml/encodings/EUC-JP_decl.rb | 6 + lib/rexml/encodings/ISO-8859-1.rb | 23 + lib/rexml/encodings/ISO-8859-1_decl.rb | 6 + lib/rexml/encodings/Shift-JIS.rb | 17 + lib/rexml/encodings/Shift-JIS_decl.rb | 6 + lib/rexml/encodings/Shift_JIS.rb | 17 + lib/rexml/encodings/UNILE.rb | 27 + lib/rexml/encodings/UNILE_decl.rb | 6 + lib/rexml/encodings/US-ASCII.rb | 23 + lib/rexml/encodings/US-ASCII_decl.rb | 6 + lib/rexml/encodings/UTF-16.rb | 27 + lib/rexml/encodings/UTF-16_decl.rb | 6 + lib/rexml/entity.rb | 159 +++++ lib/rexml/functions.rb | 360 ++++++++++ lib/rexml/instruction.rb | 62 ++ lib/rexml/light/node.rb | 232 +++++++ lib/rexml/namespace.rb | 47 ++ lib/rexml/node.rb | 35 + lib/rexml/output.rb | 22 + lib/rexml/parent.rb | 165 +++++ lib/rexml/parseexception.rb | 44 ++ lib/rexml/parsers/baseparser.rb | 391 +++++++++++ lib/rexml/parsers/lightparser.rb | 56 ++ lib/rexml/parsers/pullparser.rb | 143 ++++ lib/rexml/parsers/sax2parser.rb | 204 ++++++ lib/rexml/parsers/streamparser.rb | 33 + lib/rexml/parsers/ultralightparser.rb | 52 ++ lib/rexml/parsers/xpathparser.rb | 598 +++++++++++++++++ lib/rexml/quickpath.rb | 266 ++++++++ lib/rexml/rexml.rb | 26 + lib/rexml/sax2listener.rb | 94 +++ lib/rexml/source.rb | 191 
++++++ lib/rexml/streamlistener.rb | 89 +++ lib/rexml/text.rb | 279 ++++++++ lib/rexml/xmldecl.rb | 72 ++ lib/rexml/xmltokens.rb | 18 + lib/rexml/xpath.rb | 62 ++ lib/rexml/xpath_parser.rb | 530 +++++++++++++++ 53 files changed, 6674 insertions(+) create mode 100644 lib/rexml/attlistdecl.rb create mode 100644 lib/rexml/attribute.rb create mode 100644 lib/rexml/cdata.rb create mode 100644 lib/rexml/child.rb create mode 100644 lib/rexml/comment.rb create mode 100644 lib/rexml/doctype.rb create mode 100644 lib/rexml/document.rb create mode 100644 lib/rexml/dtd/attlistdecl.rb create mode 100644 lib/rexml/dtd/dtd.rb create mode 100644 lib/rexml/dtd/elementdecl.rb create mode 100644 lib/rexml/dtd/entitydecl.rb create mode 100644 lib/rexml/dtd/notationdecl.rb create mode 100644 lib/rexml/element.rb create mode 100644 lib/rexml/encoding.rb create mode 100644 lib/rexml/encodings/EUC-JP.rb create mode 100644 lib/rexml/encodings/EUC-JP_decl.rb create mode 100644 lib/rexml/encodings/ISO-8859-1.rb create mode 100644 lib/rexml/encodings/ISO-8859-1_decl.rb create mode 100644 lib/rexml/encodings/Shift-JIS.rb create mode 100644 lib/rexml/encodings/Shift-JIS_decl.rb create mode 100644 lib/rexml/encodings/Shift_JIS.rb create mode 100644 lib/rexml/encodings/UNILE.rb create mode 100644 lib/rexml/encodings/UNILE_decl.rb create mode 100644 lib/rexml/encodings/US-ASCII.rb create mode 100644 lib/rexml/encodings/US-ASCII_decl.rb create mode 100644 lib/rexml/encodings/UTF-16.rb create mode 100644 lib/rexml/encodings/UTF-16_decl.rb create mode 100644 lib/rexml/entity.rb create mode 100644 lib/rexml/functions.rb create mode 100644 lib/rexml/instruction.rb create mode 100644 lib/rexml/light/node.rb create mode 100644 lib/rexml/namespace.rb create mode 100644 lib/rexml/node.rb create mode 100644 lib/rexml/output.rb create mode 100644 lib/rexml/parent.rb create mode 100644 lib/rexml/parseexception.rb create mode 100644 lib/rexml/parsers/baseparser.rb create mode 100644 
lib/rexml/parsers/lightparser.rb create mode 100644 lib/rexml/parsers/pullparser.rb create mode 100644 lib/rexml/parsers/sax2parser.rb create mode 100644 lib/rexml/parsers/streamparser.rb create mode 100644 lib/rexml/parsers/ultralightparser.rb create mode 100644 lib/rexml/parsers/xpathparser.rb create mode 100644 lib/rexml/quickpath.rb create mode 100644 lib/rexml/rexml.rb create mode 100644 lib/rexml/sax2listener.rb create mode 100644 lib/rexml/source.rb create mode 100644 lib/rexml/streamlistener.rb create mode 100644 lib/rexml/text.rb create mode 100644 lib/rexml/xmldecl.rb create mode 100644 lib/rexml/xmltokens.rb create mode 100644 lib/rexml/xpath.rb create mode 100644 lib/rexml/xpath_parser.rb (limited to 'lib') diff --git a/lib/rexml/attlistdecl.rb b/lib/rexml/attlistdecl.rb new file mode 100644 index 0000000000..d4b5c38af6 --- /dev/null +++ b/lib/rexml/attlistdecl.rb @@ -0,0 +1,62 @@ +#vim:ts=2 sw=2 noexpandtab: +require 'rexml/child' +require 'rexml/source' + +module REXML + # This class needs: + # * Documentation + # * Work! Not all types of attlists are intelligently parsed, so we just + # spew back out what we get in. This works, but it would be better if + # we formatted the output ourselves. + # + # AttlistDecls provide *just* enough support to allow namespace + # declarations. If you need some sort of generalized support, or have an + # interesting idea about how to map the hideous, terrible design of DTD + # AttlistDecls onto an intuitive Ruby interface, let me know. I'm desperate + # for anything to make DTDs more palateable. + class AttlistDecl < Child + include Enumerable + + # What is this? Got me. + attr_reader :element_name + + # Create an AttlistDecl, pulling the information from a Source. Notice + # that this isn't very convenient; to create an AttlistDecl, you basically + # have to format it yourself, and then have the initializer parse it. + # Sorry, but for the forseeable future, DTD support in REXML is pretty + # weak on convenience. 
# Whether an attlist declaration includes the given attribute definition
#  if attlist_decl.include? "xmlns:foobar"
#
# key::
#   The attribute name to look for among the declared pairs.
# Returns:: true if +key+ is a declared attribute name, false otherwise
def include?(key)
  # @pairs is used as a Hash by this class (#[] and #each); ask it directly
  # rather than building a temporary keys array and scanning it linearly.
  @pairs.key?(key)
end
# Returns the namespace URL, if defined, or nil otherwise
#
#  e = Element.new("el")
#  e.add_attributes({"xmlns:ns", "http://url"})
#  e.namespace( "ns" ) # -> "http://url"
#
# arg::
#   The prefix to resolve.  When omitted, this attribute's own prefix is
#   used.
# Returns:: whatever the owning element's #namespace returns for the
#           prefix, or nil when this attribute is not attached to an element
def namespace(arg = nil)
  # A detached attribute (Attribute.new("x", "y")) has no element to ask;
  # honour the documented "nil otherwise" instead of raising NoMethodError.
  return nil if @element.nil?
  arg = prefix if arg.nil?
  @element.namespace(arg)
end
# Returns the UNNORMALIZED value of this attribute.  That is, entities
# have been expanded to their values
#
# The expanded form is cached in @unnormalized; #to_s clears that cache
# when it switches the stored value back to the normalized form.
def value
  # Without the explicit return the guard was a no-op, so every call
  # expanded the already-expanded text again ("&amp;lt;" -> "&lt;" -> "<").
  return @unnormalized if @unnormalized

  doctype = nil
  if @element
    doc = @element.document
    doctype = doc.doctype if doc
  end
  # The stored value is about to become the unnormalized form, so any
  # cached normalized form is stale.
  @normalized = nil
  @value = @unnormalized = Text::unnormalize( @value, doctype )
end
If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. Defaults to -1. + # transitive:: + # If transitive is true and indent is >= 0, then the output will be + # pretty-printed in such a way that the added whitespace does not affect + # the absolute *value* of the document -- that is, it leaves the value + # and number of Text nodes in the document unchanged. + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. + # + # _Examples_ + # c = CData.new( " Some text " ) + # c.write( $stdout ) #-> + def write( output=$stdout, indent=-1, transitive=false, ie_hack=false ) + indent( output, indent ) + output << START + output << @string + output << STOP + end + end +end diff --git a/lib/rexml/child.rb b/lib/rexml/child.rb new file mode 100644 index 0000000000..6d3c9df5e6 --- /dev/null +++ b/lib/rexml/child.rb @@ -0,0 +1,96 @@ +require "rexml/node" + +module REXML + ## + # A Child object is something contained by a parent, and this class + # contains methods to support that. Most user code will not use this + # class directly. + class Child + include Node + attr_reader :parent # The Parent of this object + + # Constructor. Any inheritors of this class should call super to make + # sure this method is called. + # parent:: + # if supplied, the parent of this child will be set to the + # supplied value, and self will be added to the parent + def initialize( parent = nil ) + @parent = nil + # Declare @parent, but don't define it. The next line sets the + # parent. + parent.add( self ) if parent + end + + # Replaces this object with another object. 
Basically, calls + # Parent.replace_child + # + # Returns:: self + def replace_with( child ) + @parent.replace_child( self, child ) + self + end + + # Removes this child from the parent. + # + # Returns:: self + def remove + unless @parent.nil? + @parent.delete self + end + self + end + + # Sets the parent of this child to the supplied argument. + # + # other:: + # Must be a Parent object. If this object is the same object as the + # existing parent of this child, no action is taken. Otherwise, this + # child is removed from the current parent (if one exists), and is added + # to the new parent. + # Returns:: The parent added + def parent=( other ) + return @parent if @parent == other + @parent.delete self if defined? @parent and @parent + @parent = other + end + + alias :next_sibling :next_sibling_node + alias :previous_sibling :previous_sibling_node + + # Sets the next sibling of this child. This can be used to insert a child + # after some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.next_sibling = c + # # => + def next_sibling=( other ) + parent.insert_after self, other + end + + # Sets the previous sibling of this child. This can be used to insert a + # child before some other child. + # a = Element.new("a") + # b = a.add_element("b") + # c = Element.new("c") + # b.previous_sibling = c + # # => + def previous_sibling=(other) + parent.insert_before self, other + end + + # Returns:: the document this child belongs to, or nil if this child + # belongs to no document + def document + return parent.document unless parent.nil? 
# This doesn't yet handle encodings
#
# Returns:: the result of to_s; no encoding conversion is applied.
def bytes
  # The previous body looked up document.encoding into a local that was
  # never used; for a child with no parent, document is nil and that lookup
  # raised NoMethodError before to_s could run.
  to_s
end
##
# Compares this Comment to another; the contents of the comment are used
# in the comparison.
#
# other::
#   The object to compare against; its to_s is compared with this
#   comment's content.
# Returns:: -1, 0 or 1 as this comment's content sorts before, equal to or
#           after +other+'s
def <=>(other)
  # Receiver on the left: the operands were previously swapped, which
  # inverted every ordering derived from this method by Comparable.
  @string <=> other.to_s
end
String + super() + @name = first + @external_id = parent + elsif first.kind_of? DocType + super( parent ) + @name = first.name + @external_id = first.external_id + elsif first.kind_of? Array + super( parent ) + @name = first[0] + @external_id = first[1] + @long_name = first[2] + @uri = first[3] + end + end + + def node_type + :doctype + end + + def attributes_of element + rv = [] + each do |child| + child.each do |key,val| + rv << Attribute.new(key,val) + end if child.kind_of? AttlistDecl and child.element_name == element + end + rv + end + + def attribute_of element, attribute + att_decl = find do |child| + child.kind_of? AttlistDecl and + child.element_name == element and + child.include? attribute + end + return nil unless att_decl + att_decl[attribute] + end + + def clone + DocType.new self + end + + # output:: + # Where to write the string + # indent:: + # An integer. If -1, no indenting will be used; otherwise, the + # indentation will be this number of spaces, and children will be + # indented an additional amount. + # transitive:: + # Who knows? + # ie_hack:: + # Internet Explorer is the worst piece of crap to have ever been + # written, with the possible exception of Windows itself. Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. + # + def write( output, indent=0, transitive=false, ie_hack=false ) + indent( output, indent ) + output << START + output << ' ' + output << @name + output << " #@external_id" if @external_id + output << " #@long_name" if @long_name + output << " #@uri" if @uri + unless @children.empty? 
+ next_indent = indent + 2 + output << ' [' + child = nil # speed + @children.each { |child| + output << "\n" + child.write( output, next_indent ) + } + output << "\n" + #output << ' '*next_indent + output << "]" + end + output << STOP + end + + def entity( name ) + @entities[name].unnormalized if @entities[name] + end + + def add child + super(child) + @entities = DEFAULT_ENTITIES.clone if @entities == DEFAULT_ENTITIES + @entities[ child.name ] = child if child.kind_of? Entity + end + end + + # We don't really handle any of these since we're not a validating + # parser, so we can be pretty dumb about them. All we need to be able + # to do is spew them back out on a write() + + # This is an abstract class. You never use this directly; it serves as a + # parent class for the specific declarations. + class Declaration < Child + def initialize src + super() + @string = src + end + + def to_s + @string+'>' + end + + def write( output, indent ) + output << (' '*indent) if indent > 0 + output << to_s + end + end + + public + class ElementDecl < Declaration + def initialize( src ) + super + end + end + + class NotationDecl < Child + def initialize name, middle, rest + @name = name + @middle = middle + @rest = rest + end + + def to_s + "" + end + + def write( output, indent=-1 ) + output << (' '*indent) if indent > 0 + output << to_s + end + end +end diff --git a/lib/rexml/document.rb b/lib/rexml/document.rb new file mode 100644 index 0000000000..8617f904e6 --- /dev/null +++ b/lib/rexml/document.rb @@ -0,0 +1,237 @@ +require "rexml/element" +require "rexml/xmldecl" +require "rexml/source" +require "rexml/comment" +require "rexml/doctype" +require "rexml/instruction" +require "rexml/rexml" +require "rexml/parseexception" +require "rexml/output" +require "rexml/parsers/baseparser" +require "rexml/parsers/streamparser" + +module REXML + # Represents a full XML document, including PIs, a doctype, etc. A + # Document has a single child that can be accessed by root(). 
+ # Note that if you want to have an XML declaration written for a document + # you create, you must add one; REXML documents do not write a default + # declaration for you. See |DECLARATION| and |write|. + class Document < Element + # A convenient default XML declaration. If you want an XML declaration, + # the easiest way to add one is mydoc << Document::DECLARATION + DECLARATION = XMLDecl.new( "1.0", "UTF-8" ) + + # Constructor + # @param source if supplied, must be a Document, String, or IO. + # Documents have their context and Element attributes cloned. + # Strings are expected to be valid XML documents. IOs are expected + # to be sources of valid XML documents. + # @param context if supplied, contains the context of the document; + # this should be a Hash. + # NOTE that I'm not sure what the context is for; I cloned it out of + # the Electric XML API (in which it also seems to do nothing), and it + # is now legacy. It may do something, someday... it may disappear. + def initialize( source = nil, context = {} ) + super() + @context = context + return if source.nil? + if source.kind_of? Document + @context = source.context + super source + else + build( source ) + end + end + + def node_type + :document + end + + # Should be obvious + def clone + Document.new self + end + + # According to the XML spec, a root node has no expanded name + def expanded_name + '' + #d = doc_type + #d ? d.name : "UNDEFINED" + end + + alias :name :expanded_name + + # We override this, because XMLDecls and DocTypes must go at the start + # of the document + def add( child ) + if child.kind_of? XMLDecl + @children.unshift child + elsif child.kind_of? DocType + if @children[0].kind_of? 
# @return the XMLDecl encoding of this document as a String.
# If no XMLDecl has been set, or the declaration carries no encoding,
# returns the default encoding.
def encoding
  decl = xml_decl()
  # The previous one-liner parsed as `decl.nil? or (... ? ... : ...)`, so a
  # nil declaration yielded `true`, and the fallback was written as a method
  # call (XMLDecl.DEFAULT_ENCODING) instead of a constant reference.
  # NOTE(review): assumes DEFAULT_ENCODING is a constant on XMLDecl --
  # confirm against xmldecl.rb.
  return XMLDecl::DEFAULT_ENCODING if decl.nil? || decl.encoding.nil?
  decl.encoding
end
# Write the XML tree out, optionally with indent.  Every child of the
# document is written in order.  No XML declaration is invented: one is
# written only if it is among the children.
#
# output::
#   An object which supports '<< string'; this is where the document will
#   be written.  Defaults to $stdout.  It is wrapped in an Output when the
#   declared encoding is not "UTF-8".
# indent::
#   An integer.  If -1, no indenting will be used; otherwise, the
#   indentation will be this number of spaces, and children will be
#   indented an additional amount.  Defaults to -1
# transitive::
#   Passed through unchanged to each child's write.  Defaults to false
# ie_hack::
#   Passed through unchanged to each child's write; documented as inserting
#   a space before the /> on empty tags.  Defaults to false
def write( output=$stdout, indent=-1, transitive=false, ie_hack=false )
  output = Output.new( output, xml_decl.encoding ) if xml_decl.encoding != "UTF-8"
  last = @children[-1]
  @children.each { |node|
    node.write( output, indent, transitive, ie_hack )
    # Identity, not ==: an earlier node that merely compares equal to the
    # final one (e.g. two comments with the same text) must still be
    # followed by a newline.
    output << "\n" unless indent < 0 || node.equal?( last )
  }
end
Text + build_context[-1] << event[1] + else + build_context.add( + Text.new( event[1], true, nil, true ) + ) unless ( + event[1].strip.size == 0 and + build_context.ignore_whitespace_nodes + ) + end + end + when :comment + c = Comment.new( event[1] ) + build_context.add( c ) + when :cdata + c = CData.new( event[1] ) + build_context.add( c ) + when :processing_instruction + build_context.add( Instruction.new( event[1], event[2] ) ) + when :end_doctype + in_doctype = false + entities.each { |k,v| entities[k] = build_context.entities[k].value } + build_context = build_context.parent + when :start_doctype + doctype = DocType.new( event[1..-1], build_context ) + build_context = doctype + entities = {} + in_doctype = true + when :attlistdecl + n = AttlistDecl.new( event[1..-1] ) + build_context.add( n ) + when :elementdecl + n = ElementDecl.new( event[1] ) + build_context.add(n) + when :entitydecl + entities[ event[1] ] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + build_context.add(Entity.new(event)) + when :notationdecl + n = NotationDecl.new( *event[1..-1] ) + build_context.add( n ) + when :xmldecl + x = XMLDecl.new( event[1], event[2], event[3] ) + build_context.add( x ) + end + end + end + end +end diff --git a/lib/rexml/dtd/attlistdecl.rb b/lib/rexml/dtd/attlistdecl.rb new file mode 100644 index 0000000000..e176bb0749 --- /dev/null +++ b/lib/rexml/dtd/attlistdecl.rb @@ -0,0 +1,10 @@ +require "rexml/child" +module REXML + module DTD + class AttlistDecl < Child + START = ")/um + end + end +end diff --git a/lib/rexml/dtd/dtd.rb b/lib/rexml/dtd/dtd.rb new file mode 100644 index 0000000000..81119cfa9b --- /dev/null +++ b/lib/rexml/dtd/dtd.rb @@ -0,0 +1,51 @@ +require "rexml/dtd/elementdecl" +require "rexml/dtd/entitydecl" +require "rexml/comment" +require "rexml/dtd/notationdecl" +require "rexml/dtd/attlistdecl" +require "rexml/parent" + +module REXML + module DTD + class Parser + def Parser.parse( input ) + case input + when String + parse_helper input + when File + 
parse_helper input.read + end + end + + # Takes a String and parses it out + def Parser.parse_helper( input ) + contents = Parent.new + while input.size > 0 + case input + when ElementDecl.PATTERN_RE + match = $& + source = $' + contents << EleemntDecl.new( match ) + when AttlistDecl.PATTERN_RE + matchdata = $~ + source = $' + contents << AttlistDecl.new( matchdata ) + when EntityDecl.PATTERN_RE + matchdata = $~ + source = $' + contents << EntityDecl.new( matchdata ) + when Comment.PATTERN_RE + matchdata = $~ + source = $' + contents << Comment.new( matchdata ) + when NotationDecl.PATTERN_RE + matchdata = $~ + source = $' + contents << NotationDecl.new( matchdata ) + end + end + contents + end + end + end +end diff --git a/lib/rexml/dtd/elementdecl.rb b/lib/rexml/dtd/elementdecl.rb new file mode 100644 index 0000000000..c4e620f389 --- /dev/null +++ b/lib/rexml/dtd/elementdecl.rb @@ -0,0 +1,17 @@ +require "rexml/child" +module REXML + module DTD + class ElementDecl < Child + START = "/um + PATTERN_RE = /^\s*#{START}\s+((?:[:\w_][-\.\w_]*:)?[-!\*\.\w_]*)(.*?)>/ + #\s*((((["']).*?\5)|[^\/'">]*)*?)(\/)?>/um, true) + + def initialize match + @name = match[1] + @rest = match[2] + end + end + end +end diff --git a/lib/rexml/dtd/entitydecl.rb b/lib/rexml/dtd/entitydecl.rb new file mode 100644 index 0000000000..83156dfc71 --- /dev/null +++ b/lib/rexml/dtd/entitydecl.rb @@ -0,0 +1,56 @@ +require "rexml/child" +module REXML + module DTD + class EntityDecl < Child + START = "/um + SYSTEM = /^\s*#{START}\s+(?:%\s+)?(\w+)\s+SYSTEM\s+((["']).*?\3)(?:\s+NDATA\s+\w+)?\s*>/um + PLAIN = /^\s*#{START}\s+(\w+)\s+((["']).*?\3)\s*>/um + PERCENT = /^\s*#{START}\s+%\s+(\w+)\s+((["']).*?\3)\s*>/um + # + # + def initialize src + super() + md = nil + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + @middle = "PUBLIC" + @content = "#{md[2]} #{md[4]}" + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + @middle = "SYSTEM" + @content = md[2] + elsif src.match( PLAIN ) + md = 
src.match( PLAIN, true ) + @middle = "" + @content = md[2] + elsif src.match( PERCENT ) + md = src.match( PERCENT, true ) + @middle = "" + @content = md[2] + end + raise ParseException.new("failed Entity match", src) if md.nil? + @name = md[1] + end + + def to_s + rv = " 0 + rv << @content + rv + end + + def write( output, indent ) + output << (' '*indent) if indent > 0 + output << to_s + end + + def EntityDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze " \t\n\r" + listener.send inspect.downcase, thing + end + end + end +end diff --git a/lib/rexml/dtd/notationdecl.rb b/lib/rexml/dtd/notationdecl.rb new file mode 100644 index 0000000000..09b6743c5c --- /dev/null +++ b/lib/rexml/dtd/notationdecl.rb @@ -0,0 +1,39 @@ +require "rexml/child" +module REXML + module DTD + class NotationDecl < Child + START = "/um + SYSTEM = /^\s*#{START}\s+(\w[\w-]*)\s+(SYSTEM)\s+((["']).*?\4)\s*>/um + def initialize src + super() + if src.match( PUBLIC ) + md = src.match( PUBLIC, true ) + elsif src.match( SYSTEM ) + md = src.match( SYSTEM, true ) + else + raise ParseException.new( "error parsing notation: no matching pattern", src ) + end + @name = md[1] + @middle = md[2] + @rest = md[3] + end + + def to_s + "" + end + + def write( output, indent ) + output << (' '*indent) if indent > 0 + output << to_s + end + + def NotationDecl.parse_source source, listener + md = source.match( PATTERN_RE, true ) + thing = md[0].squeeze " \t\n\r" + listener.send inspect.downcase, thing + end + end + end +end diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb new file mode 100644 index 0000000000..89e419345c --- /dev/null +++ b/lib/rexml/element.rb @@ -0,0 +1,1147 @@ +require "rexml/parent" +require "rexml/namespace" +require "rexml/attribute" +require "rexml/cdata" +require "rexml/xpath" +require "rexml/parseexception" + +module REXML + # Represents a tagged XML element. 
Elements are characterized by + # having children, attributes, and names, and can themselves be + # children. + class Element < Parent + include Namespace + + UNDEFINED = "UNDEFINED"; # The default name + + # Mechanisms for accessing attributes and child elements of this + # element. + attr_reader :attributes, :elements + # The context holds information about the processing environment, such as + # whitespace handling. + attr_accessor :context + + # Constructor + # arg:: + # if not supplied, will be set to the default value. + # If a String, the name of this object will be set to the argument. + # If an Element, the object will be shallowly cloned; name, + # attributes, and namespaces will be copied. Children will +not+ be + # copied. + # If a Source, the source will be scanned and parsed for an Element, + # and all child elements will be recursively parsed as well. + # parent:: + # if supplied, must be a Parent, and will be used as + # the parent of this object. + # context:: + # If supplied, must be a hash containing context items. Context items + # include: + # * :respect_whitespace the value of this is :+all+ or an array of + # strings being the names of the elements to respect + # whitespace for. Defaults to :+all+. + # * :compress_whitespace the value can be :+all+ or an array of + # strings being the names of the elements to ignore whitespace on. + # Overrides :+respect_whitespace+. + # * :ignore_whitespace_nodes the value can be :+all+ or an array + # of strings being the names of the elements in which to ignore + # whitespace-only nodes. If this is set, Text nodes which contain only + # whitespace will not be added to the document tree. + # * :raw can be :+all+, or an array of strings being the names of + # the elements to process in raw mode. In raw mode, special + # characters in text is not converted to or from entities. 
def initialize( arg = UNDEFINED, parent=nil, context=nil )
  super(parent)

  @elements = Elements.new self
  @attributes = Attributes.new self
  @context = context

  case arg
  when String
    self.name = arg
  when Element
    # Shallow copy: name, attributes and context are taken from +arg+;
    # its children are deliberately left behind.
    self.name = arg.expanded_name
    arg.attributes.each_attribute do |attribute|
      @attributes << Attribute.new( attribute )
    end
    @context = arg.context
  end
end

# Creates a shallow copy of self (name, attributes and context; no
# children).
#  d = Document.new "<a><b/><b/><c><d/></c></a>"
#  new_a = d.root.clone
#  puts new_a  # => "<a/>"
def clone
  Element.new self
end

# Evaluates to the root element of the tree this element belongs to.
# The parent chain is followed until a node without a parent is found,
# so an element that has no parent is its own root.
#  d = Document.new '<a><b><c/></b></a>'
#  a = d[1] ; c = a[1][1]
#  d.root   # These all evaluate to the same Element,
#  a.root   # namely, <a>
#  c.root   #
def root
  parent.nil? ? self : parent.root
end

# Evaluates to the parent of this element's root, or nil if that root
# has no parent.
# NOTE(review): presumably the root's parent is the owning Document --
# confirm against Document#root.
def document
  root.parent if root
end

# Evaluates to +true+ if whitespace is respected for this element. This
# is the case if:
# 1. Neither :+respect_whitespace+ nor :+compress_whitespace+ has any value
# 2. The context has :+respect_whitespace+ set to :+all+ or
#    an array containing the name of this element, and
#    :+compress_whitespace+ isn't set to :+all+ or an array containing the
#    name of this element.
# The evaluation is tested against +expanded_name+, and so is namespace
# sensitive.
def whitespace
  @whitespace = nil
  if @context
    respect = @context[:respect_whitespace]
    if respect
      @whitespace = (respect == :all or respect.include?( expanded_name ))
    end
    compress = @context[:compress_whitespace]
    # :compress_whitespace overrides :respect_whitespace
    if compress and (compress == :all or compress.include?( expanded_name ))
      @whitespace = false
    end
  end
  # Anything that was not explicitly switched off is respected.
  @whitespace = true unless @whitespace == false
  @whitespace
end

# Evaluates to +true+ if whitespace-only Text nodes are to be dropped for
# this element: the context has :+ignore_whitespace_nodes+ set to :+all+
# or to an array containing the +expanded_name+ of this element.
# Evaluates to +false+ otherwise (including when there is no context).
def ignore_whitespace_nodes
  @ignore_whitespace_nodes = false
  setting = @context && @context[:ignore_whitespace_nodes]
  if setting
    @ignore_whitespace_nodes =
      (setting == :all or setting.include?( expanded_name ))
  end
  # Return the flag itself rather than the value of the +if+ above, which
  # is nil whenever the option is absent.
  @ignore_whitespace_nodes
end

# Evaluates to +true+ if raw mode is set for this element. This
# is the case if the context has :+raw+ set to :+all+ or
# an array containing the name of this element.
#
# The evaluation is tested against +expanded_name+, and so is namespace
# sensitive.
def raw
  setting = @context && @context[:raw]
  @raw = (setting and (setting == :all or setting.include?( expanded_name )))
  @raw
end

#once :whitespace, :raw, :ignore_whitespace_nodes

#################################################
# Namespaces                                    #
#################################################

# Evaluates to an +Array+ containing the prefixes (names) of all defined
# namespaces at this context node: the parent's prefixes merged with the
# ones declared on this element.
#  doc = Document.new("<a xmlns:x='1' xmlns:y='2'><b/><c xmlns:z='3'/></a>")
#  doc.elements['//b'].prefixes # -> ['x', 'y']
def prefixes
  inherited = parent ? parent.prefixes : []
  inherited | attributes.prefixes
end

# Evaluates to an +Array+ of the namespace values visible at this node:
# the parent's namespaces merged with the ones declared on this element.
def namespaces
  inherited = parent ? parent.namespaces : []
  inherited | attributes.namespaces
end

# Evaluates to the URI for a prefix, or the empty string if no such
# namespace is declared for this element. Evaluates recursively for
# ancestors. Returns the default namespace, if there is one.
# prefix::
#   the prefix to search for. If not supplied, returns the default
#   namespace if one exists
# Returns::
#   the namespace URI as a String, or nil if no such namespace
#   exists.
#   If the namespace is undefined, returns an empty string
#  doc = Document.new("<a xmlns='1' xmlns:y='2'><b/><c xmlns:z='3'/></a>")
#  b = doc.elements['//b']
#  b.namespace           # -> '1'
#  b.namespace("y")      # -> '2'
def namespace(prefix=nil)
  # No argument: look up the namespace of this element's own prefix.
  prefix = prefix() if prefix.nil?
  key = if prefix == ''
          "xmlns"
        elsif prefix[0,5] == 'xmlns'
          prefix
        else
          "xmlns:#{prefix}"
        end
  uri = attributes[ key ]
  # Not declared here: ask the ancestors.
  uri = parent.namespace(key) if uri.nil? and parent
  # An undeclared default namespace is the empty string, not nil.
  uri = '' if uri.nil? and key == 'xmlns'
  uri
end

# Adds a namespace to this element.
# prefix::
#   the prefix string, or the namespace URI if +uri+ is not
#   supplied
# uri::
#   the namespace URI. May be nil, in which case +prefix+ is used as
#   the URI of the default namespace
# Evaluates to: this Element
#  a = Element.new("a")
#  a.add_namespace("xmlns:foo", "bar" )
#  a.add_namespace("foo", "bar")  # shorthand for previous line
#  a.add_namespace("twiddle")
#  puts a   #-> <a xmlns:foo='bar' xmlns='twiddle'/>
def add_namespace( prefix, uri=nil )
  if uri
    qualified = (prefix =~ /^xmlns:/) ? prefix : "xmlns:#{prefix}"
    @attributes[ qualified ] = uri
  else
    @attributes["xmlns"] = prefix
  end
  self
end

# Removes a namespace from this node. This only works if the namespace is
# actually declared in this node. If no argument is passed, deletes the
# default namespace.
#
# Evaluates to: this element
#  doc = Document.new "<a xmlns:foo='bar' xmlns='twiddle'/>"
#  doc.root.delete_namespace
#  puts doc     # -> <a xmlns:foo='bar'/>
#  doc.root.delete_namespace 'foo'
#  puts doc     # -> <a/>
def delete_namespace namespace="xmlns"
  key = (namespace == 'xmlns') ? namespace : "xmlns:#{namespace}"
  declaration = attributes.get_attribute(key)
  declaration.remove unless declaration.nil?
  self
end

#################################################
# Elements                                      #
#################################################

# Adds a child to this element, optionally setting attributes in
# the element.
# element::
#   optional. If Element, the element is added.
#   Otherwise, a new Element is constructed with the argument (see
#   Element.initialize).
# attrs::
#   If supplied, must be a Hash containing String name,value
#   pairs, which will be used to set the attributes of the new Element.
# Returns:: the Element that was added
#  el = doc.add_element 'my-tag'
#  el = doc.add_element 'my-tag', {'attr1'=>'val1', 'attr2'=>'val2'}
#  el = Element.new 'my-tag'
#  doc.add_element el
def add_element element=nil, attrs=nil
  added = @elements.add element
  return added unless attrs.kind_of? Hash
  # Namespace declarations (xmlns:*) are applied before every other
  # attribute, in two passes over the same hash.
  declarations, others = attrs.partition { |key, _value| key =~ /^xmlns:/ }
  (declarations + others).each do |key, value|
    added.attributes[key] = value
  end
  added
end

# Deletes a child element.
# element::
#   Must be an +Element+, +String+, or +Integer+. If Element,
#   the element is removed. If String, the element is found (via XPath)
#   and removed. This means that any parent can remove any
#   descendant. If Integer, the Element indexed by that number will be
#   removed.
# Returns:: the element that was removed.
#  doc.delete_element "/a/b/c[@id='4']"
#  doc.delete_element doc.elements["//k"]
#  doc.delete_element 1
def delete_element element
  @elements.delete( element )
end

# Evaluates to +true+ if this element has at least one child Element
#  doc = Document.new "<a><b/><c>Text</c></a>"
#  doc.root.has_elements?               # -> true
#  doc.elements["/a/b"].has_elements?   # -> false
#  doc.elements["/a/c"].has_elements?   # -> false
def has_elements?
  not @elements.empty?
end

# Iterates through the child elements, yielding for each Element that
# has a particular attribute set.
# key::
#   the name of the attribute to search for
# value::
#   the value of the attribute
# max::
#   (optional) causes this method to return after yielding
#   for this number of matching children
# name::
#   (optional) if supplied, this is an XPath that filters
#   the children to check.
#
#  doc = Document.new "<a><b id='1'/><c id='2'/><d id='1'/><e/></a>"
#  # Yields b, c, d
#  doc.root.each_element_with_attribute( 'id' ) {|e| p e}
#  # Yields b, d
#  doc.root.each_element_with_attribute( 'id', '1' ) {|e| p e}
#  # Yields b
#  doc.root.each_element_with_attribute( 'id', '1', 1 ) {|e| p e}
#  # Yields d
#  doc.root.each_element_with_attribute( 'id', '1', 0, 'd' ) {|e| p e}
def each_element_with_attribute( key, value=nil, max=0, name=nil, &block ) # :yields: Element
  # Without a value any child carrying the attribute matches; with a
  # value the attribute must be equal to it.
  matcher = proc { |child|
    found = child.attributes[key]
    value.nil? ? (found != nil) : (found == value)
  }
  each_with_something( matcher, max, name, &block )
end

# Iterates through the children, yielding for each Element that
# has a particular text set.
# text::
#   the text to search for. If nil, or not supplied, will iterate
#   over all +Element+ children that contain at least one +Text+ node.
# max::
#   (optional) causes this method to return after yielding
#   for this number of matching children
# name::
#   (optional) if supplied, this is an XPath that filters
#   the children to check.
#
#  doc = Document.new '<a><b>b</b><c>b</c><d>d</d><e/></a>'
#  # Yields b, c, d
#  doc.each_element_with_text {|e|p e}
#  # Yields b, c
#  doc.each_element_with_text('b'){|e|p e}
#  # Yields b
#  doc.each_element_with_text('b', 1){|e|p e}
#  # Yields d
#  doc.each_element_with_text(nil, 0, 'd'){|e|p e}
def each_element_with_text( text=nil, max=0, name=nil, &block ) # :yields: Element
  matcher = proc { |child|
    text.nil? ? child.has_text? : (child.text == text)
  }
  each_with_something( matcher, max, name, &block )
end

# Synonym for Element.elements.each
def each_element( xpath=nil, &block ) # :yields: Element
  @elements.each( xpath, &block )
end

# Synonym for Element.to_a
# This is a little slower than calling elements.each directly.
# xpath:: any XPath by which to search for elements in the tree
# Returns:: an array of Elements that match the supplied path
def get_elements( xpath )
  @elements.to_a( xpath )
end

# Returns the next sibling that is an element, or nil if there is
# no Element sibling after this one
#  doc = Document.new '<a><b/>text<c/></a>'
#  doc.root.elements['b'].next_element          #-> <c/>
#  doc.root.elements['c'].next_element          #-> nil
def next_element
  candidate = next_sibling
  until candidate.nil? or candidate.kind_of? Element
    candidate = candidate.next_sibling
  end
  return candidate
end

# Returns the previous sibling that is an element, or nil if there is
# no Element sibling prior to this one
#  doc = Document.new '<a><b/>text<c/></a>'
#  doc.root.elements['c'].previous_element          #-> <b/>
#  doc.root.elements['b'].previous_element          #-> nil
def previous_element
  candidate = previous_sibling
  until candidate.nil? or candidate.kind_of? Element
    candidate = candidate.previous_sibling
  end
  return candidate
end


#################################################
# Text                                          #
#################################################

# Evaluates to +true+ if this element has at least one Text child
def has_text?
  !text().nil?
end

# A convenience method which returns the String value of the _first_
# child text element, if one exists, and +nil+ otherwise.
#
# Note that an element may have multiple Text elements, perhaps
# separated by other children. Be aware that this method only returns
# the first Text node.
#
# This method returns the +value+ of the first text child node, which
# ignores the +raw+ setting, so always returns normalized text. See
# the Text::value documentation.
#
#  doc = Document.new "<p>some text <b>this is bold!</b> more text</p>"
#  # The element 'p' has two text elements, "some text " and " more text".
#  doc.root.text              #-> "some text "
def text( path = nil )
  node = get_text path
  node.nil? ? nil : node.value
end

# Returns the first child Text node, if any, or +nil+ otherwise.
# This method returns the actual +Text+ node, rather than the String content.
# If +path+ is given, the Text node is taken from the element found at
# that path instead of from this element.
#  doc = Document.new "<p>some text <b>this is bold!</b> more text</p>"
#  # The element 'p' has two text elements, "some text " and " more text".
#  doc.root.get_text.value            #-> "some text "
def get_text path = nil
  return find { |node| node.kind_of? Text } unless path
  target = @elements[ path ]
  target.nil? ? nil : target.get_text
end

# Sets the first Text child of this object. See text() for a
# discussion about Text children.
#
# If a Text child already exists, the child is replaced by this
# content. This means that Text content can be deleted by calling
# this method with a nil argument. In this case, the next Text
# child becomes the first Text child. In no case is the order of
# any siblings disturbed.
# text::
#   If a String, a new Text child is created and added to
#   this Element as the first Text child. If Text, the text is set
#   as the first Child element. If nil, then any existing first Text
#   child is removed.
# Returns:: this Element.
#  doc = Document.new '<a><b/></a>'
#  doc.root.text = 'Sean'      #-> '<a><b/>Sean</a>'
#  doc.root.text = 'Elliott'   #-> '<a><b/>Elliott</a>'
#  doc.root.add_element 'c'    #-> '<a><b/>Elliott<c/></a>'
#  doc.root.text = 'Russell'   #-> '<a><b/>Russell<c/></a>'
#  doc.root.text = nil         #-> '<a><b/><c/></a>'
def text=( text )
  text = Text.new( text, whitespace(), nil, raw() ) if text.kind_of? String
  current = get_text
  if text.nil?
    current.remove unless current.nil?
  elsif current.nil?
    self << text
  else
    current.replace_with( text )
  end
  return self
end

# A helper method to add a Text child. Actual Text instances can
# be added with regular Parent methods, such as add() and <<()
# text::
#   if a String, a new Text instance is created and added
#   to the parent. If Text, the object is added directly.
# Returns:: this Element
#  e = Element.new('a')          #-> <a/>
#  e.add_text 'foo'              #-> <a>foo</a>
#  e.add_text Text.new(' bar')   #-> <a>foo bar</a>
# Note that at the end of this example, the branch has 3 nodes; the 'a'
# element and 2 Text node children.
def add_text( text )
  if text.kind_of? String
    if @children[-1].kind_of? Text
      # The last child is already Text: append to it instead of creating
      # a second adjacent Text node.
      @children[-1] << text
      # Return the Element on this path too, as on every other path.
      return self
    end
    text = Text.new( text, whitespace(), nil, raw() )
  end
  self << text unless text.nil?
  return self
end

# The node type of an Element is always :element.
def node_type
  :element
end

#################################################
# Attributes                                    #
#################################################

# Fetches an Attribute node by name, optionally restricted to a namespace.
# name:: the (unprefixed) attribute name
# namespace::
#   (optional) a namespace URI. If a prefix declared on this element
#   maps to that URI, the attribute "prefix:name" is looked up;
#   otherwise the plain +name+ is looked up.
# Returns:: whatever attributes.get_attribute returns for that name
def attribute( name, namespace=nil )
  qualifier = ''
  if namespace
    # Use find, not each: a +return+ inside +each+ would hand the prefix
    # string back to the caller instead of the attribute.
    matching = attributes.prefixes.find { |candidate|
      namespace( candidate ) == namespace
    }
    qualifier = "#{matching}:" if matching
  end
  attributes.get_attribute( "#{qualifier}#{name}" )
end

# Evaluates to +true+ if this element has any attributes set, false
# otherwise.
def has_attributes?
  return !@attributes.empty?
end

# Adds an attribute to this element, overwriting any existing attribute
# by the same name.
# key::
#   can be either an Attribute or a String. If an Attribute,
#   the attribute is added to the list of Element attributes. If String,
#   the argument is used as the name of the new attribute, and the value
#   parameter must be supplied.
# value::
#   Required if +key+ is a String, and ignored if the first argument is
#   an Attribute. This is a String, and is used as the value
#   of the new Attribute.
# Returns::
#   NOTE(review): for a String +key+ the method evaluates to +value+
#   (the result of the []= assignment), not to an Attribute node.
#  e = Element.new 'e'
#  e.add_attribute( 'a', 'b' )               #-> <e a='b'/>
#  e.add_attribute( 'x:a', 'c' )             #-> <e a='b' x:a='c'/>
#  e.add_attribute Attribute.new('b', 'd')   #-> <e a='b' x:a='c' b='d'/>
def add_attribute( key, value=nil )
  if key.kind_of? Attribute
    @attributes << key
  else
    @attributes[key] = value
  end
end

# Add multiple attributes to this element.
# hash:: is either a hash, or array of [name, value] arrays; any other
#        argument is ignored
#  el.add_attributes( {"name1"=>"value1", "name2"=>"value2"} )
#  el.add_attributes( [ ["name1","value1"], ["name2","value2"] ] )
def add_attributes hash
  case hash
  when Hash
    hash.each_pair { |key, value| @attributes[key] = value }
  when Array
    hash.each { |pair| @attributes[ pair[0] ] = pair[1] }
  end
end

# Removes an attribute
# key::
#   the name of the attribute to remove; it is looked up with
#   attributes.get_attribute. If no attribute is found, no action is
#   taken.
# Returns::
#   the result of removing the attribute, or nil if this Element did
#   not contain a matching attribute
#  e = Element.new('E')
#  e.add_attribute( 'name', 'Sean' )      #-> <E name='Sean'/>
#  e.add_attribute( 'sur:name', 'Russell' )
#                                          #-> <E name='Sean' sur:name='Russell'/>
#  e.delete_attribute( 'name' )           #-> <E sur:name='Russell'/>
#  e.delete_attribute( 'sur:name' )       #-> <E/>
def delete_attribute(key)
  attr = @attributes.get_attribute(key)
  attr.remove unless attr.nil?
end

#################################################
# Other Utilities                               #
#################################################

# Get an array of all CData children.
# IMMUTABLE
def cdatas
  find_all { |child| child.kind_of? CData }.freeze
end

# Get an array of all Comment children.
# IMMUTABLE
def comments
  find_all { |child| child.kind_of? Comment }.freeze
end

# Get an array of all Instruction children.
# IMMUTABLE
def instructions
  find_all { |child| child.kind_of? Instruction }.freeze
end

# Get an array of all Text children.
# IMMUTABLE
def texts
  find_all { |child| child.kind_of? Text }.freeze
end

# Writes out this element, and recursively, all children.
# output::
#   output an object which supports '<< string'; this is where the
#   document will be written.
# indent::
#   An integer. If -1, no indenting will be used; otherwise, the
#   indentation will be this number of spaces, and children will be
#   indented an additional amount. Defaults to -1
# transitive::
#   What the heck does this do? Defaults to false
# ie_hack::
#   Internet Explorer is the worst piece of crap to have ever been
#   written, with the possible exception of Windows itself.
Since IE is + # unable to parse proper XML, we have to provide a hack to generate XML + # that IE's limited abilities can handle. This hack inserts a space + # before the /> on empty tags. Defaults to false + # + # out = '' + # doc.write( out ) #-> doc is written to the string 'out' + # doc.write( $stdout ) #-> doc written to the console + def write(writer=$stdout, indent=-1, transitive=false, ie_hack=false) + #print "ID:#{indent}" + writer << "<#@expanded_name" + + @attributes.each_attribute do |attr| + writer << " " + attr.write( writer, indent ) + end unless @attributes.empty? + + if @children.empty? + writer << " " if ie_hack + writer << "/" + else + if transitive and indent>-1 and !@children[0].kind_of? Text + writer << "\n" + indent writer, indent+2 + end + writer << ">" + write_children( writer, indent, transitive, ie_hack ) + writer << "-1 + writer << "\n" + indent -= 2 if next_sibling.nil? + indent(writer, indent) + end + writer << ">" + end + + + private + # A private helper method + def each_with_something( test, max=0, name=nil ) + num = 0 + child=nil + @elements.each( name ){ |child| + yield child if test.call(child) and num += 1 + return if max>0 and num == max + } + end + + # A private helper method + def write_children( writer, indent, transitive, ie_hack ) + cr = (indent < 0) ? '' : "\n" + #if size == 1 and @children[0].kind_of?(Text) + # self[0].write( writer, -1 ) + if indent == -1 + each { |child| child.write( writer, indent, transitive, ie_hack ) } + else + next_indent = indent+2 + last_child=nil + each { |child| + unless child.kind_of? Text or last_child.kind_of? Text or transitive + writer << cr + indent(writer, next_indent) + end + child.write( writer, next_indent, transitive, ie_hack ) + last_child = child + } + unless last_child.kind_of? 
Text or transitive + writer << cr + indent( writer, indent ) + end + end + end + end + + ######################################################################## + # ELEMENTS # + ######################################################################## + + # A class which provides filtering of children for Elements, and + # XPath search support. You are expected to only encounter this class as + # the element.elements object. Therefore, you are + # _not_ expected to instantiate this yourself. + class Elements + include Enumerable + # Constructor + # parent:: the parent Element + def initialize parent + @element = parent + end + + # Fetches a child element. Filters only Element children, regardless of + # the XPath match. + # index:: + # the search parameter. This is either an Integer, which + # will be used to find the index'th child Element, or an XPath, + # which will be used to search for the Element. Because + # of the nature of XPath searches, any element in the connected XML + # document can be fetched through any other element. The + # Integer index is 1-based, not 0-based. This means that the first + # child element is at index 1, not 0, and the +n+th element is at index + # +n+, not n-1. This is because XPath indexes element children + # starting from 1, not 0, and the indexes should be the same. + # name:: + # optional, and only used in the first argument is an + # Integer. In that case, the index'th child Element that has the + # supplied name will be returned. Note again that the indexes start at 1. + # Returns:: the first matching Element, or nil if no child matched + # doc = Document.new '' + # doc.root.elements[1] #-> + # doc.root.elements['c'] #-> + # doc.root.elements[2,'c'] #-> + def []( index, name=nil) + if index.kind_of? Integer + raise "index (#{index}) must be >= 1" if index < 1 + name = literalize name if name + num = 0 + child = nil + @element.find { |child| + child.kind_of? Element and + (name.nil? ? 
true : child.has_name?( name )) and + (num += 1) == index + } + else + return XPath::first( @element, index ) + #{ |element| + # return element if element.kind_of? Element + #} + #return nil + end + end + + # Sets an element, replacing any previous matching element. If no + # existing element is found ,the element is added. + # index:: Used to find a matching element to replace. See [](). + # element:: + # The element to replace the existing element with + # the previous element + # Returns:: nil if no previous element was found. + # + # doc = Document.new '' + # doc.root.elements[10] = Element.new('b') #-> + # doc.root.elements[1] #-> + # doc.root.elements[1] = Element.new('c') #-> + # doc.root.elements['c'] = Element.new('d') #-> + def []=( index, element ) + previous = self[index] + if previous.nil? + @element.add element + else + previous.replace_with element + end + return previous + end + + # Returns +true+ if there are no +Element+ children, +false+ otherwise + def empty? + @element.find{ |child| child.kind_of? Element}.nil? + end + + # Returns the index of the supplied child (starting at 1), or -1 if + # the element is not a child + # element:: an +Element+ child + def index element + rv = 0 + found = @element.find do |child| + child.kind_of? Element and + (rv += 1) and + child == element + end + return rv if found == element + return -1 + end + + # Deletes a child Element + # element:: + # Either an Element, which is removed directly; an + # xpath, where the first matching child is removed; or an Integer, + # where the n'th Element is removed. + # Returns:: the removed child + # doc = Document.new '' + # b = doc.root.elements[1] + # doc.root.elements.delete b #-> + # doc.elements.delete("a/c[@id='1']") #-> + # doc.root.elements.delete 1 #-> + def delete element + if element.kind_of? Element + @element.delete element + else + el = self[element] + el.remove if el + end + end + + # Removes multiple elements. 
# Filters for Element children, regardless of
# XPath matching.
# xpath:: all elements matching this String path are removed.
# Returns:: an Array of Elements that have been removed
#  doc = Document.new '<a><c/><c/><c/><c/></a>'
#  deleted = doc.elements.delete_all 'a/c' #-> [<c/>, <c/>, <c/>, <c/>]
def delete_all( xpath )
  removed = []
  # Collect first, remove afterwards, so the tree is not modified while
  # the XPath iteration is still running.
  XPath::each( @element, xpath ) do |node|
    removed << node if node.kind_of? Element
  end
  removed.each do |node|
    @element.delete node
    node.remove
  end
  removed
end

# Adds an element
# element::
#   if supplied, is either an Element, String, or
#   Source (see Element.initialize). If not supplied or nil, a
#   new, default Element will be constructed
# Returns:: the added Element
#  a = Element.new 'a'
#  a.elements.add Element.new('b')  #-> <a><b/></a>
#  a.elements.add 'c'               #-> <a><b/><c/></a>
def add element=nil
  if element.kind_of?( Element )
    @element << element
    element.context = @element.context
    return element
  end
  # Anything else is handed to Element.new, which receives this object
  # as its parent argument.
  Element.new( (element.nil? ? "" : element), self, @element.context )
end

alias :<< :add

# Iterates through all of the child Elements, optionally filtering
# them by a given XPath
# xpath::
#   optional. If supplied, this is a String XPath, and is used to
#   filter the children, so that only matching children are yielded. Note
#   that XPaths are automatically filtered for Elements, so that
#   non-Element children will not be yielded
#  doc = Document.new '<a><b/><c/><d/>sean<b/><c/><d/></a>'
#  doc.root.each {|e|p e}       #-> Yields b, c, d, b, c, d elements
#  doc.root.each('b') {|e|p e}  #-> Yields b, b elements
#  doc.root.each('child::node()')  {|e|p e}
#  #-> Yields <b/>, <c/>, <d/>, <b/>, <c/>, <d/>
#  XPath.each(doc.root, 'child::node()', &block)
#  #-> Yields <b/>, <c/>, <d/>, sean, <b/>, <c/>, <d/>
def each( xpath=nil, &block)
  XPath::each( @element, xpath ) do |node|
    yield node if node.kind_of? Element
  end
end

# Returns the number of +Element+ children of the parent object.
+ # doc = Document.new 'seanelliottrussell' + # doc.root.size #-> 6, 3 element and 3 text nodes + # doc.root.elements.size #-> 3 + def size + count = 0 + @element.each {|child| count+=1 if child.kind_of? Element } + count + end + + # Returns an Array of Element children. An XPath may be supplied to + # filter the children. Only Element children are returned, even if the + # supplied XPath matches non-Element children. + # doc = Document.new 'seanelliott' + # doc.root.elements.to_a #-> [ , ] + # doc.root.elements.to_a("child::node()") #-> [ , ] + # XPath.match(doc.root, "child::node()") #-> [ sean, , elliott, ] + def to_a( xpath=nil ) + rv = XPath.match( @element, xpath ) + return rv.find_all{|e| e.kind_of? Element} if xpath + rv + end + + private + # Private helper class. Removes quotes from quoted strings + def literalize name + name = name[1..-2] if name[0] == ?' or name[0] == ?" #' + name + end + end + + ######################################################################## + # ATTRIBUTES # + ######################################################################## + + # A class that defines the set of Attributes of an Element and provides + # operations for accessing elements in that set. + class Attributes < Hash + # Constructor + # element:: the Element of which this is an Attribute + def initialize element + @element = element + end + + # Fetches an attribute value. If you want to get the Attribute itself, + # use get_attribute() + # name:: an XPath attribute name. Namespaces are relevant here. + # Returns:: + # the String value of the matching attribute, or +nil+ if no + # matching attribute was found. + # + # doc = Document.new "" + # doc.root.attributes['att'] #-> '3' + # doc.root.attributes['bar:att'] #-> '2' + def [](name) + attr = get_attribute(name) + return attr.value unless attr.nil? + return nil + end + + # Returns the number of attributes the owning Element contains. 
#  doc = Document.new "<a x='1' y='2' foo:x='3'/>"
#  doc.root.attributes.length        #-> 3
def length
  total = 0
  each_attribute { total += 1 }
  total
end
alias :size :length

# Iterates over the attributes of an Element. Yields actual Attribute
# nodes, not String values.
#
#  doc = Document.new '<a x="1" y="2"/>'
#  doc.root.attributes.each_attribute {|attr|
#    p attr.expanded_name+" => "+attr.value
#  }
def each_attribute # :yields: attribute
  each_value do |entry|
    if entry.kind_of? Attribute
      yield entry
    else
      # Several attributes share this local name; they are stored as a
      # nested prefix => Attribute hash.
      entry.each_value { |nested| yield nested }
    end
  end
end

# Iterates over each attribute of an Element, yielding the expanded name
# and value as a pair of Strings.
#
#  doc = Document.new '<a x="1" y="2"/>'
#  doc.root.attributes.each {|name, value| p name+" => "+value }
def each
  each_attribute { |attr| yield attr.expanded_name, attr.value }
end

# Fetches an attribute
# name::
#   the name by which to search for the attribute. Can be a
#   <tt>prefix:name</tt> namespace name.
# Returns:: The first matching attribute, or nil if there was none. This
# value is an Attribute node, not the String value of the attribute.
#  doc = Document.new '<a x:foo="1" foo="2" bar="3"/>'
#  doc.root.attributes.get_attribute("foo").value    #-> "2"
#  doc.root.attributes.get_attribute("x:foo").value  #-> "1"
def get_attribute( name )
  attr = fetch( name, nil )
  unless attr.nil?
    # Stored under this exact key. A Hash means several prefixes share
    # the name; pick the entry for the owning element's prefix.
    attr = attr[ @element.prefix ] if attr.kind_of? Hash
    return attr
  end
  return nil if name.nil?

  # Look for prefix
  name =~ Namespace::NAMESPLIT
  prefix, local = $1, $2
  if prefix
    candidate = fetch( local, nil )
    if candidate.kind_of? Attribute
      # check prefix
      return candidate if prefix == candidate.prefix
    elsif not candidate.nil?
      return candidate[ prefix ]
    end
  end

  # Fall back to a value supplied by the document type, if there is one.
  document = @element.document
  doctype = document.doctype if document
  if doctype
    owner = @element.expanded_name
    owner = doctype.name if owner.size == 0
    attr_val = doctype.attribute_of( owner, name )
    return Attribute.new( name, attr_val ) if attr_val
  end
  return nil
end

# Sets an attribute, overwriting any existing attribute value by the
# same name. Namespace is significant.
# name:: the name of the attribute
# value::
#   (optional) If supplied, the value of the attribute. If
#   nil, any existing matching attribute is deleted.
# Returns::
#   Owning element
#  doc = Document.new "<a x:foo='1' foo='3'/>"
#  doc.root.attributes['y:foo'] = '2'
#  doc.root.attributes['foo'] = '4'
#  doc.root.attributes['x:foo'] = nil
def []=( name, value )
  if value.nil? # Delete the named attribute
    delete get_attribute( name )
    return
  end

  value = Attribute.new(name, value) unless value.kind_of? Attribute
  value.element = @element
  existing = fetch value.name, nil
  if existing.nil?
    store(value.name, value)
  elsif existing.kind_of? Hash
    existing[value.prefix] = value
  elsif existing.prefix == value.prefix
    store value.name, value
  else
    # Check for conflicting namespaces: two different prefixes bound to
    # the same namespace would define the same attribute twice.
    if value.prefix != "xmlns" and existing.prefix != "xmlns" and
       @element.namespace( existing.prefix ) ==
       @element.namespace( value.prefix )
      raise ParseException.new(
        "Namespace conflict in adding attribute \"#{value.name}\": "+
        "Prefix \"#{existing.prefix}\" = "+
        "\"#{@element.namespace(existing.prefix)}\" and prefix "+
        "\"#{value.prefix}\" = \"#{@element.namespace(value.prefix)}\"")
    end
    store value.name, { existing.prefix => existing,
                        value.prefix => value }
  end
  return @element
end

# Returns an array of Strings containing all of the prefixes declared
# by this set of attributes. The array does not include the default
# namespace declaration, if one exists.
+ # doc = Document.new("") + # prefixes = doc.root.attributes.prefixes #-> ['x', 'y'] + def prefixes + ns = [] + each_attribute do |attribute| + ns << attribute.name if attribute.prefix == 'xmlns' + end + if @element.document and @element.document.doctype + expn = @element.expanded_name + expn = @element.document.doctype.name if expn.size == 0 + @element.document.doctype.attributes_of(expn).each { + |attribute| + ns << attribute.name if attribute.prefix == 'xmlns' + } + end + ns + end + + def namespaces + namespaces = [] + each_attribute do |attribute| + namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + end + if @element.document and @element.document.doctype + expn = @element.expanded_name + expn = @element.document.doctype.name if expn.size == 0 + @element.document.doctype.attributes_of(expn).each { + |attribute| + namespaces << attribute.value if attribute.prefix == 'xmlns' or attribute.name == 'xmlns' + } + end + namespaces + end + + # Removes an attribute + # attribute:: + # either a String, which is the name of the attribute to remove -- + # namespaces are significant here -- or the attribute to remove. + # Returns:: the owning element + # doc = Document.new "" + # doc.root.attributes.delete 'foo' #-> " + # doc.root.attributes.delete 'x:foo' #-> " + # attr = doc.root.attributes.get_attribute('y:foo') + # doc.root.attributes.delete attr #-> " + def delete( attribute ) + name = nil + prefix = nil + if attribute.kind_of? Attribute + name = attribute.name + prefix = attribute.prefix + else + attribute =~ Namespace::NAMESPLIT + prefix, name = $1, $2 + prefix = '' unless prefix + end + old = fetch name, nil + attr = nil + if old.kind_of? Hash # the supplied attribute is one of many + attr = old.delete(prefix) + if old.size == 1 + repl = nil + old.each_value{|v| repl = v} + store name, repl + end + elsif old.nil? 
+ return @element + else # the supplied attribute is a top-level one + attr = old + res = super(name) + end + @element + end + + # Adds an attribute, overriding any existing attribute by the + # same name. Namespaces are significant. + # attribute:: An Attribute + def add( attribute ) + self[attribute.name] = attribute + end + + alias :<< :add + + # Deletes all attributes matching a name. Namespaces are significant. + # name:: + # A String; all attributes that match this path will be removed + # Returns:: an Array of the Attributes that were removed + def delete_all( name ) + rv = [] + each_attribute { |attribute| + rv << attribute if attribute.expanded_name == name + } + rv.each{ |attr| attr.remove } + return rv + end + end +end diff --git a/lib/rexml/encoding.rb b/lib/rexml/encoding.rb new file mode 100644 index 0000000000..3d7dcd6260 --- /dev/null +++ b/lib/rexml/encoding.rb @@ -0,0 +1,62 @@ +module REXML + module Encoding + @@uconv_available = false + + ENCODING_CLAIMS = { } + + def Encoding.claim( encoding_str, match=nil ) + if match + ENCODING_CLAIMS[ match ] = encoding_str + else + ENCODING_CLAIMS[ /^\s* Encoding name + attr_reader :encoding + def encoding=( enc ) + enc = UTF_8 unless enc + @encoding = enc.upcase + require "rexml/encodings/#@encoding" unless @encoding == UTF_8 + end + + def check_encoding str + rv = ENCODING_CLAIMS.find{|k,v| str =~ k } + # Raise an exception if there is a declared encoding and we don't + # recognize it + unless rv + if str =~ /^\s* 0 + encodings |= Dir[ File.join(incl_dir, 'rexml', 'encodings', '*_decl.rb') ] + end + encodings.collect!{ |f| File.basename(f) } + encodings.uniq! 
+ end + encodings.each { |enc| require "rexml/encodings/#{enc}" } + end +end diff --git a/lib/rexml/encodings/EUC-JP.rb b/lib/rexml/encodings/EUC-JP.rb new file mode 100644 index 0000000000..cedd6751e7 --- /dev/null +++ b/lib/rexml/encodings/EUC-JP.rb @@ -0,0 +1,17 @@ +begin + require 'uconv' + + module REXML + module Encoding + def from_euc_jp(str) + return Uconv::euctou8(str) + end + + def to_euc_jp content + return Uconv::u8toeuc(content) + end + end + end +rescue LoadError + raise "uconv is required for Japanese encoding support." +end diff --git a/lib/rexml/encodings/EUC-JP_decl.rb b/lib/rexml/encodings/EUC-JP_decl.rb new file mode 100644 index 0000000000..4c7cd828a6 --- /dev/null +++ b/lib/rexml/encodings/EUC-JP_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + EUC_JP = 'EUC-JP' + claim( EUC_JP ) + end +end diff --git a/lib/rexml/encodings/ISO-8859-1.rb b/lib/rexml/encodings/ISO-8859-1.rb new file mode 100644 index 0000000000..98c5aff3b2 --- /dev/null +++ b/lib/rexml/encodings/ISO-8859-1.rb @@ -0,0 +1,23 @@ +module REXML + module Encoding + # Convert from UTF-8 + def to_iso_8859_1 content + array_utf8 = content.unpack('U*') + array_enc = [] + array_utf8.each do |num| + if num <= 0xFF + array_enc << num + else + # Numeric entity (&#nnnn;); shard by Stefan Scholl + array_enc.concat "&\##{num};".unpack('C*') + end + end + array_enc.pack('C*') + end + + # Convert to UTF-8 + def from_iso_8859_1(str) + str.unpack('C*').pack('U*') + end + end +end diff --git a/lib/rexml/encodings/ISO-8859-1_decl.rb b/lib/rexml/encodings/ISO-8859-1_decl.rb new file mode 100644 index 0000000000..a738d30472 --- /dev/null +++ b/lib/rexml/encodings/ISO-8859-1_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + ISO_8859_1 = 'ISO-8859-1' + claim( ISO_8859_1 ) + end +end diff --git a/lib/rexml/encodings/Shift-JIS.rb b/lib/rexml/encodings/Shift-JIS.rb new file mode 100644 index 0000000000..8650174538 --- /dev/null +++ b/lib/rexml/encodings/Shift-JIS.rb @@ -0,0 +1,17 @@ +begin + 
require 'uconv' + + module REXML + module Encoding + def to_shift_jis content + Uconv::u8tosjis(content) + end + + def from_shift_jis(str) + Uconv::sjistou8(str) + end + end + end +rescue LoadError + raise "uconv is required for Japanese encoding support." +end diff --git a/lib/rexml/encodings/Shift-JIS_decl.rb b/lib/rexml/encodings/Shift-JIS_decl.rb new file mode 100644 index 0000000000..66f650144a --- /dev/null +++ b/lib/rexml/encodings/Shift-JIS_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + claim( 'Shift-JIS' ) + claim( 'Shift_JIS' ) + end +end diff --git a/lib/rexml/encodings/Shift_JIS.rb b/lib/rexml/encodings/Shift_JIS.rb new file mode 100644 index 0000000000..8650174538 --- /dev/null +++ b/lib/rexml/encodings/Shift_JIS.rb @@ -0,0 +1,17 @@ +begin + require 'uconv' + + module REXML + module Encoding + def to_shift_jis content + Uconv::u8tosjis(content) + end + + def from_shift_jis(str) + Uconv::sjistou8(str) + end + end + end +rescue LoadError + raise "uconv is required for Japanese encoding support." +end diff --git a/lib/rexml/encodings/UNILE.rb b/lib/rexml/encodings/UNILE.rb new file mode 100644 index 0000000000..74bed14340 --- /dev/null +++ b/lib/rexml/encodings/UNILE.rb @@ -0,0 +1,27 @@ +module REXML + module Encoding + def to_unile content + array_utf8 = content.unpack("U*") + array_enc = [] + array_utf8.each do |num| + if ((num>>16) > 0) + array_enc << ?? 
+ array_enc << 0 + else + array_enc << (num & 0xFF) + array_enc << (num >> 8) + end + end + array_enc.pack('C*') + end + + def from_unile(str) + array_enc=str.unpack('C*') + array_utf8 = [] + 2.step(array_enc.size-1, 2){|i| + array_utf8 << (array_enc.at(i) + array_enc.at(i+1)*0x100) + } + array_utf8.pack('U*') + end + end +end diff --git a/lib/rexml/encodings/UNILE_decl.rb b/lib/rexml/encodings/UNILE_decl.rb new file mode 100644 index 0000000000..9e1c11dc03 --- /dev/null +++ b/lib/rexml/encodings/UNILE_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + UNILE = 'UNILE' + claim( UNILE, /^\377\376/ ) + end +end diff --git a/lib/rexml/encodings/US-ASCII.rb b/lib/rexml/encodings/US-ASCII.rb new file mode 100644 index 0000000000..4ca2c82a83 --- /dev/null +++ b/lib/rexml/encodings/US-ASCII.rb @@ -0,0 +1,23 @@ +module REXML + module Encoding + # Convert from UTF-8 + def to_us_ascii content + array_utf8 = content.unpack('U*') + array_enc = [] + array_utf8.each do |num| + if num <= 0xFF + array_enc << num + else + # Numeric entity (&#nnnn;); shard by Stefan Scholl + array_enc.concat "&\##{num};".unpack('C*') + end + end + array_enc.pack('C*') + end + + # Convert to UTF-8 + def from_us_ascii(str) + str.unpack('C*').pack('U*') + end + end +end diff --git a/lib/rexml/encodings/US-ASCII_decl.rb b/lib/rexml/encodings/US-ASCII_decl.rb new file mode 100644 index 0000000000..1e69234fff --- /dev/null +++ b/lib/rexml/encodings/US-ASCII_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + US_ASCII = 'US-ASCII' + claim( US_ASCII ) + end +end diff --git a/lib/rexml/encodings/UTF-16.rb b/lib/rexml/encodings/UTF-16.rb new file mode 100644 index 0000000000..2aeef76a0c --- /dev/null +++ b/lib/rexml/encodings/UTF-16.rb @@ -0,0 +1,27 @@ +module REXML + module Encoding + def to_utf_16 content + array_utf8 = content.unpack("U*") + array_enc = [] + array_utf8.each do |num| + if ((num>>16) > 0) + array_enc << 0 + array_enc << ?? 
+ else + array_enc << (num >> 8) + array_enc << (num & 0xFF) + end + end + array_enc.pack('C*') + end + + def from_utf_16(str) + array_enc=str.unpack('C*') + array_utf8 = [] + 2.step(arrayEnc.size-1, 2){|i| + array_utf8 << (array_enc.at(i+1) + array_enc.at(i)*0x100) + } + array_utf8.pack('U*') + end + end +end diff --git a/lib/rexml/encodings/UTF-16_decl.rb b/lib/rexml/encodings/UTF-16_decl.rb new file mode 100644 index 0000000000..f405a9f259 --- /dev/null +++ b/lib/rexml/encodings/UTF-16_decl.rb @@ -0,0 +1,6 @@ +module REXML + module Encoding + UTF_16 = 'UTF-16' + claim( UTF_16, /^\376\377/ ) + end +end diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb new file mode 100644 index 0000000000..4b88a3c553 --- /dev/null +++ b/lib/rexml/entity.rb @@ -0,0 +1,159 @@ +require 'rexml/child' +require 'rexml/source' +require 'rexml/xmltokens' + +module REXML + # God, I hate DTDs. I really do. Why this idiot standard still + # plagues us is beyond me. + class Entity < Child + include XMLTokens + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "" + GEDECL = "" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + + attr_reader :name, :external, :ref, :ndata, :pubid + + # Create a new entity. Simple entities can be constructed by passing a + # name, value to the constructor; this creates a generic, plain entity + # reference. For anything more complicated, you have to pass a Source to + # the constructor with the entity definiton, or use the accessor methods. 
+ # +WARNING+: There is no validation of entity state except when the entity + # is read from a stream. If you start poking around with the accessors, + # you can easily create a non-conformant Entity. The best thing to do is + # dump the stupid DTDs and use XMLSchema instead. + # + # e = Entity.new( 'amp', '&' ) + def initialize stream, value=nil, parent=nil, reference=false + super(parent) + @ndata = @pubid = @value = @external = nil + if stream.kind_of? Array + @name = stream[1] + if stream[-1] == '%' + @reference = true + stream.pop + else + @reference = false + end + if stream[2] =~ /SYSTEM|PUBLIC/ + @external = stream[2] + if @external == 'SYSTEM' + @ref = stream[3] + @ndata = stream[4] if stream.size == 5 + else + @pubid = stream[3] + @ref = stream[4] + end + else + @value = stream[2] + end + else + @reference = reference + @external = nil + @name = stream + @value = value + end + end + + # Evaluates whether the given string matchs an entity definition, + # returning true if so, and false otherwise. + def Entity::matches? string + (ENTITYDECL =~ string) == 0 + end + + # Evaluates to the unnormalized value of this entity; that is, replacing + # all entities -- both %ent; and &ent; entities. This differs from + # +value()+ in that +value+ only replaces %ent; entities. + def unnormalized + v = value() + return nil if v.nil? + @unnormalized = Text::unnormalize(v, parent) + @unnormalized + end + + #once :unnormalized + + # Returns the value of this entity unprocessed -- raw. This is the + # normalized value; that is, with all %ent; and &ent; entities intact + def normalized + @value + end + + # Write out a fully formed, correct entity definition (assuming the Entity + # object itself is valid.) + def write out, indent=-1 + out << '' + end + + # Returns this entity as a string. See write(). + def to_s + rv = '' + write rv + rv + end + + PEREFERENCE_RE = /#{PEREFERENCE}/um + # Returns the value of this entity. 
At the moment, only internal entities + # are processed. If the value contains internal references (IE, + # %blah;), those are replaced with their values. IE, if the doctype + # contains: + # + # + # then: + # doctype.entity('yada').value #-> "nanoo bar nanoo" + def value + if @value + matches = @value.scan(PEREFERENCE_RE) + rv = @value.clone + if @parent + matches.each do |entity_reference| + entity_value = @parent.entity( entity_reference[0] ) + rv.gsub!( /%#{entity_reference};/um, entity_value ) + end + end + return rv + end + nil + end + end + + # This is a set of entity constants -- the ones defined in the XML + # specification. These are +gt+, +lt+, +amp+, +quot+ and +apos+. + module EntityConst + # +>+ + GT = Entity.new( 'gt', '>' ) + # +<+ + LT = Entity.new( 'lt', '<' ) + # +&+ + AMP = Entity.new( 'amp', '&' ) + # +"+ + QUOT = Entity.new( 'quot', '"' ) + # +'+ + APOS = Entity.new( 'apos', "'" ) + end +end diff --git a/lib/rexml/functions.rb b/lib/rexml/functions.rb new file mode 100644 index 0000000000..d2d078640b --- /dev/null +++ b/lib/rexml/functions.rb @@ -0,0 +1,360 @@ +module REXML + # If you add a method, keep in mind two things: + # (1) the first argument will always be a list of nodes from which to + # filter. In the case of context methods (such as position), the function + # should return an array with a value for each child in the array. + # (2) all method calls from XML will have "-" replaced with "_". 
+ # Therefore, in XML, "local-name()" is identical (and actually becomes) + # "local_name()" + module Functions + @@node = nil + @@index = nil + @@size = nil + @@variables = {} + @@namespace_context = {} + + def Functions::node=(value); @@node = value; end + def Functions::index=(value); @@index = value; end + def Functions::size=(value); @@size = value; end + def Functions::variables=(value); @@variables = value; end + def Functions::namespace_context=(value) + @@namespace_context = value + end + def Functions::node; @@node; end + def Functions::index; @@index; end + def Functions::size; @@size; end + def Functions::variables; @@variables; end + def Functions::namespace_context; @@namespace_context; end + + def Functions::text( ) + return true if @@node.node_type == :text + end + + def Functions::last( ) + @@size + end + + def Functions::position( ) + @@index + end + + def Functions::count( node_set ) + node_set.size + end + + # Since REXML is non-validating, this method is not implemented as it + # requires a DTD + def Functions::id( object ) + end + + # UNTESTED + def Functions::local_name( node_set=nil ) + get_namespace( node_set ) do |node| + return node.local_name + end + end + + def Functions::namespace_uri( node_set=nil ) + get_namespace( node_set ) {|node| node.namespace} + end + + def Functions::name( node_set=nil ) + get_namespace( node_set ) do |node| + node.expanded_name + end + end + + # Helper method. + def Functions::get_namespace( node_set = nil ) + if node_set == nil + yield @@node if defined? @@node.namespace + else + if node_set.namespace + yield node_set + else + return unless node_set.kind_of? Enumerable + node_set.each { |node| yield node if defined? node.namespace } + end + end + end + + # A node-set is converted to a string by returning the string-value of the + # node in the node-set that is first in document order. If the node-set is + # empty, an empty string is returned. 
+ # + # A number is converted to a string as follows + # + # NaN is converted to the string NaN + # + # positive zero is converted to the string 0 + # + # negative zero is converted to the string 0 + # + # positive infinity is converted to the string Infinity + # + # negative infinity is converted to the string -Infinity + # + # if the number is an integer, the number is represented in decimal form + # as a Number with no decimal point and no leading zeros, preceded by a + # minus sign (-) if the number is negative + # + # otherwise, the number is represented in decimal form as a Number + # including a decimal point with at least one digit before the decimal + # point and at least one digit after the decimal point, preceded by a + # minus sign (-) if the number is negative; there must be no leading zeros + # before the decimal point apart possibly from the one required digit + # immediately before the decimal point; beyond the one required digit + # after the decimal point there must be as many, but only as many, more + # digits as are needed to uniquely distinguish the number from all other + # IEEE 754 numeric values. + # + # The boolean false value is converted to the string false. The boolean + # true value is converted to the string true. + # + # An object of a type other than the four basic types is converted to a + # string in a way that is dependent on that type. + def Functions::string( object=nil ) + #object = @context unless object + if object.instance_of? Array + string( object[0] ) + elsif defined? object.node_type + if object.node_type == :attribute + object.value + elsif object.node_type == :element + object.text + else + object.to_s + end + else + object.to_s + end + end + + # UNTESTED + def Functions::concat( *objects ) + objects.join + end + + # Fixed by Mike Stok + def Functions::starts_with( string, test ) + string(string).index(string(test)) == 0 + end + + # Fixed by Mike Stok + def Functions::contains( string, test ) + string(string).include? 
string(test) + end + + # Kouhei fixed this + def Functions::substring_before( string, test ) + ruby_string = string(string) + ruby_index = ruby_string.index(string(test)) + if ruby_index.nil? + "" + else + ruby_string[ 0...ruby_index ] + end + end + + # Kouhei fixed this too + def Functions::substring_after( string, test ) + ruby_string = string(string) + ruby_index = ruby_string.index(string(test)) + if ruby_index.nil? + "" + else + ruby_string[ ruby_index+1..-1 ] + end + end + + # Take equal portions of Mike Stok and Sean Russell; mix + # vigorously, and pour into a tall, chilled glass. Serves 10,000. + def Functions::substring( string, start, length=nil ) + ruby_string = string(string) + ruby_length = if length.nil? + ruby_string.length.to_f + else + number(length) + end + ruby_start = number(start) + + # Handle the special cases + return '' if ( + ruby_length.nan? or + ruby_start.nan? or + ruby_start.infinite? + ) + + infinite_length = ruby_length.infinite? == 1 + ruby_length = ruby_string.length if infinite_length + + # Now, get the bounds. The XPath bounds are 1..length; the ruby bounds + # are 0..length. Therefore, we have to offset the bounds by one. + ruby_start = ruby_start.round - 1 + ruby_length = ruby_length.round + + if ruby_start < 0 + ruby_length += ruby_start unless infinite_length + ruby_start = 0 + end + return '' if ruby_length <= 0 + ruby_string[ruby_start,ruby_length] + end + + # UNTESTED + def Functions::string_length( string ) + string(string).length + end + + # UNTESTED + def Functions::normalize_space( string=nil ) + string = string(@@node) if string.nil? + if string.kind_of? Array + string.collect{|x| string.to_s.strip.gsub(/\s+/um, ' ') if string} + else + string.to_s.strip.gsub(/\s+/um, ' ') + end + end + + # This is entirely Mike Stok's beast + def Functions::translate( string, tr1, tr2 ) + from = string(tr1) + to = string(tr2) + + # the map is our translation table. 
+ # + # if a character occurs more than once in the + # from string then we ignore the second & + # subsequent mappings + # + # if a charactcer maps to nil then we delete it + # in the output. This happens if the from + # string is longer than the to string + # + # there's nothing about - or ^ being special in + # http://www.w3.org/TR/xpath#function-translate + # so we don't build ranges or negated classes + + map = Hash.new + 0.upto(from.length - 1) { |pos| + from_char = from[pos] + unless map.has_key? from_char + map[from_char] = + if pos < to.length + to[pos] + else + nil + end + end + } + + string(string).unpack('U*').collect { |c| + if map.has_key? c then map[c] else c end + }.compact.pack('U*') + end + + # UNTESTED + def Functions::boolean( object=nil ) + if object.kind_of? String + if object =~ /\d+/u + return object.to_f != 0 + else + return object.size > 0 + end + elsif object.kind_of? Array + object = object.find{|x| x and true} + end + return object ? true : false + end + + # UNTESTED + def Functions::not( object ) + not boolean( object ) + end + + # UNTESTED + def Functions::true( ) + true + end + + # UNTESTED + def Functions::false( ) + false + end + + # UNTESTED + def Functions::lang( language ) + lang = false + node = @@node + attr = nil + until node.nil? + if node.node_type == :element + attr = node.attributes["xml:lang"] + unless attr.nil? 
+ lang = compare_language(string(language), attr) + break + else + end + end + node = node.parent + end + lang + end + + def Functions::compare_language lang1, lang2 + lang2.downcase.index(lang1.downcase) == 0 + end + + # a string that consists of optional whitespace followed by an optional + # minus sign followed by a Number followed by whitespace is converted to + # the IEEE 754 number that is nearest (according to the IEEE 754 + # round-to-nearest rule) to the mathematical value represented by the + # string; any other string is converted to NaN + # + # boolean true is converted to 1; boolean false is converted to 0 + # + # a node-set is first converted to a string as if by a call to the string + # function and then converted in the same way as a string argument + # + # an object of a type other than the four basic types is converted to a + # number in a way that is dependent on that type + def Functions::number( object=nil ) + object = @@node unless object + if object == true + Float(1) + elsif object == false + Float(0) + elsif object.kind_of? Array + string( object ).to_f + elsif object.kind_of? Float + object + else + object.to_s.to_f + end + end + + def Functions::sum( nodes ) + end + + def Functions::floor( number ) + number(number).floor + end + + def Functions::ceiling( number ) + number(number).ceil + end + + def Functions::round( number ) + begin + number(number).round + rescue FloatDomainError + number(number) + end + end + + def Functions::method_missing( id ) + puts "METHOD MISSING #{id.id2name}" + XPath.match( @@node, id.id2name ) + end + end +end diff --git a/lib/rexml/instruction.rb b/lib/rexml/instruction.rb new file mode 100644 index 0000000000..0b770d4b3d --- /dev/null +++ b/lib/rexml/instruction.rb @@ -0,0 +1,62 @@ +require "rexml/child" +require "rexml/source" + +module REXML + # Represents an XML Instruction; IE, + # TODO: Add parent arg (3rd arg) to constructor + class Instruction < Child + START = '<\?' 
+ STOP = '\?>' + + # target is the "name" of the Instruction; IE, the "tag" in + # content is everything else. + attr_accessor :target, :content + + # Constructs a new Instruction + # @param target can be one of a number of things. If String, then + # the target of this instruction is set to this. If an Instruction, + # then the Instruction is shallowly cloned (target and content are + # copied). If a Source, then the source is scanned and parsed for + # an Instruction declaration. + # @param content Must be either a String, or a Parent. Can only + # be a Parent if the target argument is a Source. Otherwise, this + # String is set as the content of this instruction. + def initialize(target, content=nil) + if target.kind_of? String + super() + @target = target + @content = content + elsif target.kind_of? Instruction + super(content) + @target = target.target + @content = target.content + end + @content.strip! if @content + end + + def clone + Instruction.new self + end + + def write writer, indent=-1, transitive=false, ie_hack=false + indent(writer, indent) + writer << START.sub(/\\/u, '') + writer << @target + writer << ' ' + writer << @content + writer << STOP.sub(/\\/u, '') + end + + # @return true if other is an Instruction, and the content and target + # of the other matches the target and content of this object. + def ==( other ) + other.kind_of? 
Instruction and + other.target == @target and + other.content == @content + end + + def node_type + :processing_instruction + end + end +end diff --git a/lib/rexml/light/node.rb b/lib/rexml/light/node.rb new file mode 100644 index 0000000000..5b7b95a7dc --- /dev/null +++ b/lib/rexml/light/node.rb @@ -0,0 +1,232 @@ +require 'rexml/xmltokens' +require 'rexml/light/node' + +# Development model +# document = Node.new + +# Add an element "foo" to the document +# foo = document << "foo" +# # Set attribute "attr" on foo +# foo["attr"] = "la" +# # Set another attribute in a different namespace +# foo["attr", "namespace"] = "too" +# # Swap foo into another namespace +# foo.namespace = "blah" +# # Add a couple of element nodes to foo +# foo << "a" +# foo << "b" +# # Access the children of foo in various ways +# a = foo[0] +# foo.each { |child| +# #... +# } +# # Add text to foo +# # Add instruction +# # Add comment +# # Get the root of the document +# document == a.root +# # Write the document out +# puts document.to_s +module REXML + module Light + # Represents a tagged XML element. Elements are characterized by + # having children, attributes, and names, and can themselves be + # children. + class Node < Array + alias :_old_get :[] + alias :_old_put :[]= + + NAMESPLIT = /^(?:(#{XMLTokens::NCNAME_STR}):)?(#{XMLTokens::NCNAME_STR})/u + # Create a new element. + def initialize node=nil + if node.kind_of? String + node = [ :text, node ] + elsif node.nil? 
+ node = [ :start_document, nil, nil ] + end + replace( node ) + _old_put( 1, 0, 1 ) + _old_put( 1, nil ) + end + + def size + el!() + super-4 + end + + def each( &block ) + el!() + size.times { |x| yield( at(x+4) ) } + end + + def name + el!() + at(2) + end + + def name=( name_str, ns=nil ) + el!() + pfx = '' + pfx = "#{prefix(ns)}:" if ns + _old_put(1, "#{pfx}#{name_str}") + end + + def parent=( node ) + _old_put(1,node) + end + + def local_name + el!() + namesplit + @name + end + + def local_name=( name_str ) + el!() + _old_put( 1, "#@prefix:#{name_str}" ) + end + + def prefix( namespace=nil ) + el!() + prefix_of( self, namespace ) + end + + def namespace( prefix=prefix() ) + el!() + namespace_of( self, prefix ) + end + + def namespace=( namespace ) + el!() + @prefix = prefix( namespace ) + pfx = '' + pfx = "#@prefix:" if @prefix.size > 0 + _old_put(1, "#{pfx}#@name") + end + + def []( reference, ns=nil ) + el!() + if reference.kind_of? String + pfx = '' + pfx = "#{prefix(ns)}:" if ns + at(3)["#{pfx}#{reference}"] + elsif reference.kind_of? Range + _old_get( Range.new(4+reference.begin, reference.end, reference.exclude_end?) ) + else + _old_get( 4+reference ) + end + end + + # Doesn't handle namespaces yet + def []=( reference, ns, value=nil ) + el!() + if reference.kind_of? String + value = ns unless value + at( 3 )[reference] = value + elsif reference.kind_of? Range + _old_put( Range.new(3+reference.begin, reference.end, reference.exclude_end?), ns ) + else + if value + _old_put( 4+reference, ns, value ) + else + _old_put( 4+reference, ns ) + end + end + end + + # Append a child to this element, optionally under a provided namespace. + # The namespace argument is ignored if the element argument is an Element + # object. Otherwise, the element argument is a string, the namespace (if + # provided) is the namespace the element is created in. + def << element + if text? 
+ at(-1) << element + else + newnode = Node.new( element ) + newnode.parent = self + self.push( newnode ) + end + at(-1) + end + + def node_type + self[0] + end + + def text=( foo ) + replace = at(4).kind_of? String ? 1 : 0 + self._old_put(4,replace, normalizefoo) + end + + def root + context = self + context = context.at(1) while context.at(1) + end + + def element? + at(0) == :start_element + end + + def has_name?( name, namespace = '' ) + el!() + at(3) == name and namespace() == namespace + end + + def children + el!() + self + end + + def parent + at(1) + end + + def text? + at(0) == :text + end + + def to_s + + end + + def el! + if text?() + _old_put( 0, :start_element ) + push({}) + end + end + + private + + def namesplit + return if @name.defined? + at(2) =~ NAMESPLIT + @prefix = '' || $1 + @name = $2 + end + + def namespace_of( node, prefix=nil ) + if not prefix + name = at(2) + name =~ NAMESPLIT + prefix = $1 + end + to_find = 'xmlns' + to_find = "xmlns:#{prefix}" if not prefix.nil? + ns = at(3)[ to_find ] + ns ? ns : namespace_of( @node[0], prefix ) + end + + def prefix_of( node, namespace=nil ) + if not namespace + name = node.name + name =~ NAMESPLIT + $1 + else + ns = at(3).find { |k,v| v == namespace } + ns ? ns : prefix_of( node.parent, namespace ) + end + end + end + end +end diff --git a/lib/rexml/namespace.rb b/lib/rexml/namespace.rb new file mode 100644 index 0000000000..3e8790580b --- /dev/null +++ b/lib/rexml/namespace.rb @@ -0,0 +1,47 @@ +require 'rexml/xmltokens' + +module REXML + # Adds named attributes to an object. 
+ module Namespace + # The name of the object, valid if set + attr_reader :name, :expanded_name + # The expanded name of the object, valid if name is set + attr_accessor :prefix + include XMLTokens + NAMESPLIT = /^(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})/u + + # Sets the name and the expanded name + def name=( name ) + @expanded_name = name + name =~ NAMESPLIT + if $1 + @prefix = $1 + else + @prefix = "" + @namespace = "" + end + @name = $2 + end + + # Compares names optionally WITH namespaces + def has_name?( other, ns=nil ) + if ns + return (namespace() == ns and name() == other) + elsif other.include? ":" + return fully_expanded_name == other + else + return name == other + end + end + + alias :local_name :name + + # Fully expand the name, even if the prefix wasn't specified in the + # source file. + def fully_expanded_name + ns = prefix + return "#{ns}:#@name" if ns.size > 0 + return @name + end + end +end diff --git a/lib/rexml/node.rb b/lib/rexml/node.rb new file mode 100644 index 0000000000..41d9eee43b --- /dev/null +++ b/lib/rexml/node.rb @@ -0,0 +1,35 @@ +require "rexml/parseexception" + +module REXML + # Represents a node in the tree. Nodes are never encountered except as + # superclasses of other objects. Nodes have siblings. + module Node + # @return the next sibling (nil if unset) + def next_sibling_node + return nil if @parent.nil? + @parent[ @parent.index(self) + 1 ] + end + + # @return the previous sibling (nil if unset) + def previous_sibling_node + return nil if @parent.nil? + ind = @parent.index(self) + return nil if ind == 0 + @parent[ ind - 1 ] + end + + def to_s indent=-1 + rv = "" + write rv,indent + rv + end + + def indent to, ind + to << " "*ind unless ind<1 + end + + def parent? 
+ false; + end + end +end diff --git a/lib/rexml/output.rb b/lib/rexml/output.rb new file mode 100644 index 0000000000..7d4ab2e13b --- /dev/null +++ b/lib/rexml/output.rb @@ -0,0 +1,22 @@ +require 'rexml/encoding' + +module REXML + class Output + include Encoding + attr_reader :encoding + def initialize real_IO, encd="iso-8859-1" + @output = real_IO + self.encoding = encd + + eval <<-EOL + alias :encode :to_#{encoding.tr('-', '_').downcase} + alias :decode :from_#{encoding.tr('-', '_').downcase} + EOL + @to_utf = encd == UTF_8 ? false : true + end + + def <<( content ) + @output << (@to_utf ? encode(content) : content) + end + end +end diff --git a/lib/rexml/parent.rb b/lib/rexml/parent.rb new file mode 100644 index 0000000000..5c1ed97324 --- /dev/null +++ b/lib/rexml/parent.rb @@ -0,0 +1,165 @@ +require "rexml/child" + +module REXML + # A parent has children, and has methods for accessing them. The Parent + # class is never encountered except as the superclass for some other + # object. + class Parent < Child + include Enumerable + + # Constructor + # @param parent if supplied, will be set as the parent of this object + def initialize parent=nil + super(parent) + @children = [] + end + + def add( object ) + #puts "PARENT GOTS #{size} CHILDREN" + object.parent = self + @children << object + #puts "PARENT NOW GOTS #{size} CHILDREN" + object + end + + alias :push :add + alias :<< :push + + def unshift( object ) + object.parent = self + @children.unshift object + end + + def delete( object ) + return unless @children.include? 
object + @children.delete object + object.parent = nil + end + + def each(&block) + @children.each(&block) + end + + def delete_if( &block ) + @children.delete_if(&block) + end + + def delete_at( index ) + @children.delete_at index + end + + def each_index( &block ) + @children.each_index(&block) + end + + # Fetches a child at a given index + # @param index the Integer index of the child to fetch + def []( index ) + @children[index] + end + + alias :each_child :each + + + + # Set an index entry. See Array.[]= + # @param index the index of the element to set + # @param opt either the object to set, or an Integer length + # @param child if opt is an Integer, this is the child to set + # @return the parent (self) + def []=( *args ) + args[-1].parent = self + @children[*args[0..-2]] = args[-1] + end + + # Inserts an child before another child + # @param child1 this is either an xpath or an Element. If an Element, + # child2 will be inserted before child1 in the child list of the parent. + # If an xpath, child2 will be inserted before the first child to match + # the xpath. + # @param child2 the child to insert + # @return the parent (self) + def insert_before( child1, child2 ) + if child1.kind_of? String + child1 = XPath.first( self, child1 ) + child1.parent.insert_before child1, child2 + else + ind = index(child1) + child2.parent.delete(child2) if child2.parent + @children[ind,0] = child2 + child2.parent = self + end + self + end + + # Inserts an child after another child + # @param child1 this is either an xpath or an Element. If an Element, + # child2 will be inserted after child1 in the child list of the parent. + # If an xpath, child2 will be inserted after the first child to match + # the xpath. + # @param child2 the child to insert + # @return the parent (self) + def insert_after( child1, child2 ) + if child1.kind_of? 
String + child1 = XPath.first( self, child1 ) + child1.parent.insert_after child1, child2 + else + ind = index(child1)+1 + child2.parent.delete(child2) if child2.parent + @children[ind,0] = child2 + child2.parent = self + end + self + end + + def to_a + @children.dup + end + + # Fetches the index of a given child + # @param child the child to get the index of + # @return the index of the child, or nil if the object is not a child + # of this parent. + def index( child ) + count = -1 + @children.find { |i| count += 1 ; i.hash == child.hash } + count + end + + # @return the number of children of this parent + def size + @children.size + end + + # Replaces one child with another, making sure the nodelist is correct + # @param to_replace the child to replace (must be a Child) + # @param replacement the child to insert into the nodelist (must be a + # Child) + def replace_child( to_replace, replacement ) + ind = @children.index( to_replace ) + to_replace.parent = nil + @children[ind,0] = replacement + replacement.parent = self + end + + # Deeply clones this object. This creates a complete duplicate of this + # Parent, including all descendants. + def deep_clone + cl = clone() + each do |child| + if child.kind_of? Parent + cl << child.deep_clone + else + cl << child.clone + end + end + cl + end + + alias :children :to_a + + def parent? 
+ true + end + end +end diff --git a/lib/rexml/parseexception.rb b/lib/rexml/parseexception.rb new file mode 100644 index 0000000000..04928d9175 --- /dev/null +++ b/lib/rexml/parseexception.rb @@ -0,0 +1,44 @@ +module REXML + class ParseException < Exception + attr_accessor :source, :parser, :continued_exception + + def initialize( message, source=nil, parser=nil, exception=nil ) + super(message) + @source = source + @parser = parser + @continued_exception = exception + end + + def to_s + # Quote the original exception, if there was one + if @continued_exception + err = @continued_exception.message + err << "\n" + err << @continued_exception.backtrace[0..3].join("\n") + err << "\n...\n" + else + err = "" + end + + # Get the stack trace and error message + err << super + + # Add contextual information + err << "\n#{@source.current_line}\nLast 80 unconsumed characters:\n#{@source.buffer[0..80].gsub(/\n/, ' ')}\n" if @source + err << "\nContext:\n#{@parser.context}" if @parser + err + end + + def position + @source.current_line[0] if @source + end + + def line + @source.current_line[2] if @source + end + + def context + @source.current_line + end + end +end diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb new file mode 100644 index 0000000000..e5c2cf7d96 --- /dev/null +++ b/lib/rexml/parsers/baseparser.rb @@ -0,0 +1,391 @@ +require 'rexml/parseexception' +require 'rexml/source' + +module REXML + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? 
+ # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class BaseParser + NCNAME_STR= '[\w:][-\w\d.]*' + NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + + NAMECHAR = '[-\w\d\.:]' + NAME = "([\\w:]#{NAMECHAR}*)" + NMTOKEN = "(?:#{NAMECHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + + DOCTYPE_START = /\A\s*)/um + ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um + COMMENT_START = /\A/um + CDATA_START = /\A/um + XMLDECL_START = /\A<\?xml\s/u; + XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>*/um + INSTRUCTION_START = /\A<\?/u + INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um + TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{NAME_STR}\s*=\s*(["']).*?\3)*)\s*(\/)?>/um + CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + + VERSION = /\bversion\s*=\s*["'](.*?)['"]/um + ENCODING = /\bencoding=["'](.*?)['"]/um + STANDALONE = /\bstandalone=["'](.*?)['"]/um + + ENTITY_START = /^\s*/um + ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)" + NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)" + ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))" + ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})" + ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')" + DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))" + ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}" + ATTLISTDECL_START = /^\s*/um + NOTATIONDECL_START = /^\s*/um + SYSTEM = /^\s*/um + + TEXT_PATTERN = /\A([^<]*)/um + + # Entity constants + PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9-()+,./:=?;!*@$_%#" + SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))} + PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')} + EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))" + NDATADECL = "\\s+NDATA\\s+#{NAME}" + PEREFERENCE = "%#{NAME};" + ENTITYVALUE = 
%Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))} + PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})" + ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))" + PEDECL = "" + GEDECL = "" + ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um + + EREFERENCE = /&(?!#{NAME};)/ + + DEFAULT_ENTITIES = { + 'gt' => [/>/, '>', '>'], + 'lt' => [/</, '<', '<'], + 'quot' => [/"/, '"', '"'], + "apos" => [/'/, "'", "'"] + } + + def initialize( source ) + self.stream = source + end + + def stream=( source ) + if source.kind_of? String + @source = Source.new(source) + elsif source.kind_of? IO + @source = IOSource.new(source) + elsif source.kind_of? Source + @source = source + else + raise "#{source.type} is not a valid input stream. It must be \n"+ + "either a String, IO, or Source." + end + @closed = nil + @document_status = nil + @tags = [] + @stack = [] + @entities = [] + end + + # Returns true if there are no more events + def empty? + !has_next? + end + + # Returns true if there are more events. Synonymous with !empty? + def has_next? + @source.read if @source.buffer.size==0 and !@source.empty? + (!@source.empty? and @source.buffer.strip.size>0) or @stack.size>0 or @closed + end + + # Push an event back on the head of the stream. This method + # has (theoretically) infinite depth. + def unshift token + @stack.unshift(token) + end + + # Peek at the +depth+ event in the stack. The first element on the stack + # is at depth 0. If +depth+ is -1, will parse to the end of the input + # stream and return the last event, which is always :end_document. + # Be aware that this causes the stream to be parsed up to the +depth+ + # event, so you can effectively pre-parse the entire document (pull the + # entire thing into memory) using this method. + def peek depth=0 + raise 'Illegal argument "#{depth}"' if depth < -1 + temp = [] + if depth == -1 + temp.push(pull()) until empty? 
+ else + while @stack.size+temp.size < depth+1 + temp.push(pull()) + end + end + @stack += temp if temp.size > 0 + @stack[depth] + end + + # Returns the next event. This is a +PullEvent+ object. + def pull + return [ :end_document ] if empty? + if @closed + x, @closed = @closed, nil + return [ :end_element, x ] + end + return @stack.shift if @stack.size > 0 + @source.read if @source.buffer.size==0 + if @document_status == nil + @source.match( /^\s*/um, true ) + word = @source.match( /^\s*(<.*?)>/um ) + word = word[1] unless word.nil? + case word + when COMMENT_START + return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + when XMLDECL_START + results = @source.match( XMLDECL_PATTERN, true )[1] + version = VERSION.match( results ) + version = version[1] unless version.nil? + encoding = ENCODING.match(results) + encoding = encoding[1] unless encoding.nil? + @source.encoding = encoding + standalone = STANDALONE.match(results) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone] + when INSTRUCTION_START + return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + when DOCTYPE_START + md = @source.match( DOCTYPE_PATTERN, true ) + identity = md[1] + close = md[2] + identity =~ IDENTITY + name = $1 + raise "DOCTYPE is missing a name" if name.nil? + pub_sys = $2.nil? ? nil : $2.strip + long_name = $3.nil? ? nil : $3.strip + uri = $4.nil? ? 
nil : $4.strip + args = [ :start_doctype, name, pub_sys, long_name, uri ] + if close == ">" + @document_status = :after_doctype + @source.read if @source.buffer.size==0 + md = @source.match(/^\s*/um, true) + @stack << [ :end_doctype ] + else + @document_status = :in_doctype + end + return args + else + @document_status = :after_doctype + @source.read if @source.buffer.size==0 + md = @source.match(/\s*/um, true) + end + end + if @document_status == :in_doctype + md = @source.match(/\s*(.*?>)/um) + case md[1] + when ELEMENTDECL_START + return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ] + when ENTITY_START + match = @source.match( ENTITYDECL, true ).to_a.compact + match[0] = :entitydecl + ref = false + if match[1] == '%' + ref = true + match.delete_at 1 + end + # Now we have to sort out what kind of entity reference this is + if match[2] == 'SYSTEM' + # External reference + match[3] = match[3][1..-2] # PUBID + match.delete_at(4) if match.size > 4 # Chop out NDATA decl + # match is [ :entity, name, SYSTEM, pubid(, ndata)? ] + elsif match[2] == 'PUBLIC' + # External reference + match[3] = match[3][1..-2] # PUBID + match[4] = match[4][1..-2] # HREF + # match is [ :entity, name, PUBLIC, pubid, href ] + else + match[2] = match[2][1..-2] + match.pop if match.size == 4 + # match is [ :entity, name, value ] + end + match << '%' if ref + return match + when ATTLISTDECL_START + md = @source.match( ATTLISTDECL_PATTERN, true ) + raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil? + element = md[1] + contents = md[0] + + pairs = {} + values = md[0].scan( ATTDEF ) + values.each do |attdef| + unless attdef[3] == "#IMPLIED" + attdef.compact! 
+ val = attdef[3] + val = attdef[4] if val == "#FIXED " + pairs[attdef[0]] = val + end + end + return [ :attlistdecl, element, pairs, contents ] + when NOTATIONDECL_START + md = nil + if @source.match( PUBLIC ) + md = @source.match( PUBLIC, true ) + elsif @source.match( SYSTEM ) + md = @source.match( SYSTEM, true ) + else + raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source ) + end + return [ :notationdecl, md[1], md[2], md[3] ] + when /^\s*]\s*>/um + @document_status = :after_doctype + @source.match( /^\s*]\s*>/um, true ) + return [ :end_doctype ] + end + end + begin + if @source.buffer[0] == ?< + if @source.buffer[1] == ?/ + last_tag = @tags.pop + md = @source.match( CLOSE_MATCH, true ) + raise REXML::ParseException.new( "Missing end tag for '#{last_tag}' "+ + "(got \"#{md[1]}\")", @source) unless last_tag == md[1] + return [ :end_element, last_tag ] + elsif @source.buffer[1] == ?! + md = @source.match(/\A(\s*[^>]*>)/um) + #puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}" + raise REXML::ParseException.new("Malformed node", @source) unless md + case md[1] + when CDATA_START + return [ :cdata, @source.match( CDATA_PATTERN, true )[1] ] + when COMMENT_START + return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ] + else + raise REXML::ParseException.new( "Declarations can only occur "+ + "in the doctype declaration.", @source) + end + elsif @source.buffer[1] == ?? 
+ md = @source.match( INSTRUCTION_PATTERN, true ) + return [ :processing_instruction, md[1], md[2] ] + else + # Get the next tag + md = @source.match(TAG_MATCH, true) + raise REXML::ParseException.new("malformed XML: missing tag start", @source) unless md + attrs = [] + if md[2].size > 0 + attrs = md[2].scan( ATTRIBUTE_PATTERN ) + raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 + end + + if md[4] + @closed = md[1] + else + @tags.push( md[1] ) + end + attributes = {} + attrs.each { |a,b,c| attributes[a] = c } + return [ :start_element, md[1], attributes ] + end + else + md = @source.match(TEXT_PATTERN, true) + raise "no text to add" if md[0].length == 0 + # unnormalized = Text::unnormalize( md[1], self ) + # return PullEvent.new( :text, md[1], unnormalized ) + return [ :text, md[1] ] + end + rescue REXML::ParseException + raise $! + rescue Exception, NameError => error + raise REXML::ParseException.new( "Exception parsing", + @source, self, error ) + end + return [ :dummy ] + end + + def entity( reference, entities ) + value = nil + value = entities[ reference ] if entities + if not value + value = DEFAULT_ENTITIES[ reference ] + value = value[2] if value + end + unnormalize( value, entities ) if value + end + + # Escapes all possible entities + def normalize( input, entities=nil, entity_filter=nil ) + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( EREFERENCE, '&' ) + entities.each do |key, value| + copy.gsub!( value, "&#{key};" ) unless entity_filter and + entity_filter.include?(entity) + end if entities + copy.gsub!( EREFERENCE, '&' ) + DEFAULT_ENTITIES.each do |key, value| + copy.gsub!( value[2], value[1] ) + end + copy + end + + # Unescapes all possible entities + def unnormalize( string, entities=nil, filter=nil ) + rv = string.clone + rv.gsub!( /\r\n?/, "\n" ) + matches = rv.scan( REFERENCE) + return rv if matches.size == 
0 + rv.gsub!( /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m| + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + matches.collect!{|x|x[0]}.compact! + if matches.size > 0 + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = entity( entity_reference, entities ) + if entity_value + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value ) + end + end + end + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + er = DEFAULT_ENTITIES[entity_reference] + rv.gsub!( er[0], er[2] ) if er + end + end + rv.gsub!( /&/, '&' ) + end + rv + end + end + end +end diff --git a/lib/rexml/parsers/lightparser.rb b/lib/rexml/parsers/lightparser.rb new file mode 100644 index 0000000000..e2f083bc8e --- /dev/null +++ b/lib/rexml/parsers/lightparser.rb @@ -0,0 +1,56 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' +require 'rexml/light/node' + +module REXML + module Parsers + class LightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end + + def rewind + @stream.rewind + @parser.stream = @stream + end + + def parse + root = context = REXML::Light::Node.new([ :document ]) + while true + event = @parser.pull + case event[0] + when :end_document + break + when :end_doctype + context = context.parent + when :start_element, :start_doctype + new_node = REXML::Light::Node.new(event) + context << new_node + new_node.parent = context + context = new_node + when :end_element, :end_doctype + context = context.parent + else + new_node = REXML::Light::Node.new(event) + context << new_node + new_node.parent = context + end + end + root + end + end + + # An element is an array. 
The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end +end diff --git a/lib/rexml/parsers/pullparser.rb b/lib/rexml/parsers/pullparser.rb new file mode 100644 index 0000000000..aeda6251fe --- /dev/null +++ b/lib/rexml/parsers/pullparser.rb @@ -0,0 +1,143 @@ +require 'rexml/parseexception' +require 'rexml/parsers/baseparser' +require 'rexml/xmltokens' + +module REXML + module Parsers + # = Using the Pull Parser + # This API is experimental, and subject to change. + # parser = PullParser.new( "texttxet" ) + # while parser.has_next? + # res = parser.next + # puts res[1]['att'] if res.start_tag? and res[0] == 'b' + # end + # See the PullEvent class for information on the content of the results. + # The data is identical to the arguments passed for the various events to + # the StreamListener API. + # + # Notice that: + # parser = PullParser.new( "BAD DOCUMENT" ) + # while parser.has_next? + # res = parser.next + # raise res[1] if res.error? + # end + # + # Nat Price gave me some good ideas for the API. + class PullParser < BaseParser + include XMLTokens + + def initialize stream + super + @entities = {} + end + + def each + while has_next? + yield self.pull + end + end + + def peek depth=0 + PullEvent.new(super) + end + + def pull + event = super + case event[0] + when :entitydecl + @entities[ event[1] ] = + event[2] unless event[2] =~ /PUBLIC|SYSTEM/ + when :text + unnormalized = unnormalize( event[1], @entities ) + event << unnormalized + end + PullEvent.new( event ) + end + end + + # A parsing event. The contents of the event are accessed as an +Array?, + # and the type is given either by the ...? methods, or by accessing the + # +type+ accessor. 
The contents of this object vary from event to event, + # but are identical to the arguments passed to +StreamListener+s for each + # event. + class PullEvent + # The type of this event. Will be one of :tag_start, :tag_end, :text, + # :processing_instruction, :comment, :doctype, :attlistdecl, :entitydecl, + # :notationdecl, :entity, :cdata, :xmldecl, or :error. + def initialize(arg) + @contents = arg + end + def []( index ) + @contents[index+1] + end + def event_type + @contents[0] + end + # Content: [ String tag_name, Hash attributes ] + def start_element? + @contents[0] == :start_element + end + # Content: [ String tag_name ] + def end_element? + @contents[0] == :end_element + end + # Content: [ String raw_text, String unnormalized_text ] + def text? + @contents[0] == :text + end + # Content: [ String text ] + def instruction? + @contents[0] == :processing_instruction + end + # Content: [ String text ] + def comment? + @contents[0] == :comment + end + # Content: [ String name, String pub_sys, String long_name, String uri ] + def doctype? + @contents[0] == :start_doctype + end + # Content: [ String text ] + def attlistdecl? + @contents[0] == :attlistdecl + end + # Content: [ String text ] + def elementdecl? + @contents[0] == :elementdecl + end + # Due to the wonders of DTDs, an entity declaration can be just about + # anything. There's no way to normalize it; you'll have to interpret the + # content yourself. However, the following is true: + # + # * If the entity declaration is an internal entity: + # [ String name, String value ] + # Content: [ String text ] + def entitydecl? + @contents[0] == :entitydecl + end + # Content: [ String text ] + def notationdecl? + @contents[0] == :notationdecl + end + # Content: [ String text ] + def entity? + @contents[0] == :entity + end + # Content: [ String text ] + def cdata? + @contents[0] == :cdata + end + # Content: [ String version, String encoding, String standalone ] + def xmldecl? 
+ @contents[0] == :xmldecl + end + def error? + @contents[0] == :error + end + + def inspect + @contents[0].to_s + ": " + @contents[1..-1].inspect + end + end + end +end diff --git a/lib/rexml/parsers/sax2parser.rb b/lib/rexml/parsers/sax2parser.rb new file mode 100644 index 0000000000..8598fd43e9 --- /dev/null +++ b/lib/rexml/parsers/sax2parser.rb @@ -0,0 +1,204 @@ +module REXML + module Parsers + class SAX2Parser + def initialize source + @parser = BaseParser.new(source) + @listeners = [] + @procs = [] + @namespace_stack = [] + @has_listeners = false + @tag_stack = [] + end + + # Listen arguments: + # + # Symbol, Array, Block + # Listen to Symbol events on Array elements + # Symbol, Block + # Listen to Symbol events + # Array, Listener + # Listen to all events on Array elements + # Array, Block + # Listen to :start_element events on Array elements + # Listener + # Listen to All events + # + # Symbol can be one of: :start_element, :end_element, + # :start_prefix_mapping, :end_prefix_mapping, :characters, + # :processing_instruction, :doctype, :attlistdecl, :elementdecl, + # :entitydecl, :notationdecl, :cdata, :xmldecl, :comment + # + # Array contains regular expressions or strings which will be matched + # against fully qualified element names. + # + # Listener must implement the methods in SAX2Listener + # + # Block will be passed the same arguments as a SAX2Listener method would + # be, where the method name is the same as the matched Symbol. + # See the SAX2Listener for more information. + def listen( *args, &blok ) + if args[0].kind_of? Symbol + if args.size == 2 + args[1].each { |match| @procs << [args[0], match, blok] } + else + add( [args[0], /.*/, blok] ) + end + elsif args[0].kind_of? 
Array + if args.size == 2 + args[0].each { |match| add( [nil, match, args[1]] ) } + else + args[0].each { |match| add( [ :start_element, match, blok ] ) } + end + else + add([nil, /.*/, args[0]]) + end + end + + def deafen( listener=nil, &blok ) + if listener + @listeners.delete_if {|item| item[-1] == listener } + @has_listeners = false if @listeners.size == 0 + else + @procs.delete_if {|item| item[-1] == blok } + end + end + + def parse + @procs.each { |sym,match,block| block.call if sym == :start_document } + @listeners.each { |sym,match,block| + block.start_document if sym == :start_document or sym.nil? + } + root = context = [] + while true + event = @parser.pull + case event[0] + when :end_document + handle( :end_document ) + break + when :end_doctype + context = context[1] + when :start_element + @tag_stack.push(event[1]) + # find the observers for namespaces + procs = get_procs( :start_prefix_mapping, event[1] ) + listeners = get_listeners( :start_prefix_mapping, event[1] ) + if procs or listeners + # break out the namespace declarations + # The attributes live in event[2] + nsdecl = event[2].find_all { |n, value| n =~ /^xmlns:/ } + nsdecl.collect! 
{ |n, value| [ n[6..-1], value ] } + @namespace_stack.push({}) + nsdecl.each do |n,v| + @namespace_stack[-1][n] = v + # notify observers of namespaces + procs.each { |ob| ob.call( n, v ) } if procs + listeners.each { |ob| ob.start_prefix_mapping(n, v) } if listeners + end + end + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :start_element, event[1] ) + listeners = get_listeners( :start_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1], event[2] ) } if procs + listeners.each { |ob| + ob.start_element( uri, local, event[1], event[2] ) + } if listeners + when :end_element + @tag_stack.pop + event[1] =~ Namespace::NAMESPLIT + prefix = $1 + local = $2 + uri = get_namespace(prefix) + # find the observers for start_element + procs = get_procs( :end_element, event[1] ) + listeners = get_listeners( :end_element, event[1] ) + # notify observers + procs.each { |ob| ob.call( uri, local, event[1] ) } if procs + listeners.each { |ob| + ob.end_element( uri, local, event[1] ) + } if listeners + + namespace_mapping = @namespace_stack.pop + # find the observers for namespaces + procs = get_procs( :end_prefix_mapping, event[1] ) + listeners = get_listeners( :end_prefix_mapping, event[1] ) + if procs or listeners + namespace_mapping.each do |prefix, uri| + # notify observers of namespaces + procs.each { |ob| ob.call( prefix ) } if procs + listeners.each { |ob| ob.end_prefix_mapping(prefix) } if listeners + end + end + when :text + normalized = @parser.normalize( event[1] ) + handle( :characters, normalized ) + when :processing_instruction, :comment, :doctype, :attlistdecl, + :elementdecl, :entitydecl, :cdata, :notationdecl, :xmldecl + handle( *event ) + end + end + end + + private + def handle( symbol, *arguments ) + tag = @tag_stack[-1] + procs = get_procs( symbol, tag ) + listeners = get_listeners( symbol, tag ) + # notify observers + 
procs.each { |ob| ob.call( *arguments ) } if procs + listeners.each { |l| + l.send( symbol.to_s, *arguments ) + } if listeners + end + + # The following methods are duplicates, but it is faster than using + # a helper + def get_procs( symbol, name ) + return nil if @procs.size == 0 + @procs.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + (name.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end + def get_listeners( symbol, name ) + return nil if @listeners.size == 0 + @listeners.find_all do |sym, match, block| + ( + (sym.nil? or symbol == sym) and + (name.nil? or ( + (name == match) or + (match.kind_of? Regexp and name =~ match) + ) + ) + ) + end.collect{|x| x[-1]} + end + + def add( pair ) + if pair[-1].kind_of? Proc + @procs << pair unless @procs.include? pair + else + @listeners << pair unless @listeners.include? pair + @has_listeners = true + end + end + + def get_namespace( prefix ) + uri = @namespace_stack.find do |ns| + not ns[prefix].nil? + end + uri[prefix] unless uri.nil? 
+ end + end + end +end diff --git a/lib/rexml/parsers/streamparser.rb b/lib/rexml/parsers/streamparser.rb new file mode 100644 index 0000000000..51441289d9 --- /dev/null +++ b/lib/rexml/parsers/streamparser.rb @@ -0,0 +1,33 @@ +module REXML + module Parsers + class StreamParser + def initialize source, listener + @listener = listener + @parser = BaseParser.new( source ) + end + + def parse + # entity string + while true + event = @parser.pull + case event[0] + when :end_document + return + when :start_element + @listener.tag_start( event[1], event[2] ) + when :end_element + @listener.tag_end( event[1] ) + when :text + normalized = @parser.unnormalize( event[1] ) + @listener.text( normalized ) + when :processing_instruction + @listener.instruction( *event[1,2] ) + when :comment, :doctype, :attlistdecl, + :elementdecl, :entitydecl, :cdata, :notationdecl, :xmldecl + @listener.send( event[0].to_s, *event[1..-1] ) + end + end + end + end + end +end diff --git a/lib/rexml/parsers/ultralightparser.rb b/lib/rexml/parsers/ultralightparser.rb new file mode 100644 index 0000000000..f3b208bf0f --- /dev/null +++ b/lib/rexml/parsers/ultralightparser.rb @@ -0,0 +1,52 @@ +require 'rexml/parsers/streamparser' +require 'rexml/parsers/baseparser' + +module REXML + module Parsers + class UltraLightParser + def initialize stream + @stream = stream + @parser = REXML::Parsers::BaseParser.new( stream ) + end + + def rewind + @stream.rewind + @parser.stream = @stream + end + + def parse + root = context = [] + while true + event = @parser.pull + case event[0] + when :end_document + break + when :end_doctype + context = context[1] + when :start_element, :doctype + context << event + event[1,0] = [context] + context = event + when :end_element + context = context[1] + else + context << event + end + end + root + end + end + + # An element is an array. 
The array contains: + # 0 The parent element + # 1 The tag name + # 2 A hash of attributes + # 3..-1 The child elements + # An element is an array of size > 3 + # Text is a String + # PIs are [ :processing_instruction, target, data ] + # Comments are [ :comment, data ] + # DocTypes are DocType structs + # The root is an array with XMLDecls, Text, DocType, Array, Text + end +end diff --git a/lib/rexml/parsers/xpathparser.rb b/lib/rexml/parsers/xpathparser.rb new file mode 100644 index 0000000000..da27e7c705 --- /dev/null +++ b/lib/rexml/parsers/xpathparser.rb @@ -0,0 +1,598 @@ +require 'rexml/namespace' +require 'rexml/xmltokens' + +module REXML + module Parsers + # You don't want to use this class. Really. Use XPath, which is a wrapper + # for this class. Believe me. You don't want to poke around in here. + # There is strange, dark magic at work in this code. Beware. Go back! Go + # back while you still can! + class XPathParser + include XMLTokens + LITERAL = /^'([^']*)'|^"([^"]*)"/u + + def namespaces=( namespaces ) + Functions::namespace_context = namespaces + @namespaces = namespaces + end + + def parse path + path.gsub!(/([\(\[])\s+/, '\1') # Strip ignorable spaces + path.gsub!( /\s+([\]\)])/, '\1' ) + parsed = [] + path = LocationPath(path, parsed) + parsed + end + + def predicate path + parsed = [] + Predicate( "[#{path}]", parsed ) + parsed + end + + def to_string( path ) + string = "" + while path.size > 0 + case path[0] + when :ancestor, :ancestor_or_self, :attribute, :child, :descendant, :descendant_or_self, :following, :following_sibling, :namespace, :parent, :preceding, :preceding_sibling, :self + op = path.shift + string << "/" unless string.size == 0 + string << op.to_s + string << "::" + when :any + path.shift + string << "*" + when :qname + path.shift + prefix = path.shift + name = path.shift + string << prefix+":" if prefix.size > 0 + string << name + when :predicate + path.shift + string << '[' + string << predicate_to_string( path.shift ) + 
# Renders a parsed predicate (as produced by the expression parser) back
# into a string.
#
# path:: the token array of the predicate.  NOTE: tokens are consumed from
#        this array with +shift+, so it is modified.
#
# Returns the rendered String with runs of spaces squeezed to one space.
def predicate_to_string( path )
  string =
    case path[0]
    when :and, :or, :mult, :plus, :minus, :neq, :eq, :lt, :gt, :lteq,
         :gteq, :div, :mod, :union
      # Binary operator: [ op, left-operand, right-operand ]
      op = path.shift
      left = predicate_to_string( path.shift )
      right = predicate_to_string( path.shift )
      " #{left} #{op} #{right} "
    when :function
      # [ :function, name, arguments ]
      path.shift
      name = path.shift
      "#{name}( #{predicate_to_string( path.shift )} )"
    when :literal
      path.shift
      " #{path.shift.inspect} "
    else
      # Anything else is treated as a location path
      " #{to_string( path )} "
    end
  string.squeeze(" ")
end
# Parses one NodeTest from the front of +path+ and appends its tokens to
# +parsed+:
#
#   *                              -> :any
#   comment() | text() | node()    -> :comment | :text | :node
#   processing-instruction(LIT?)   -> :processing_instruction, literal-or-nil
#   prefix:*                       -> :namespace, prefix
#   [prefix:]name                  -> :qname, prefix-or-"", name
#
# Returns the unparsed remainder of +path+ (+path+ itself when nothing
# matched).  Raises ParseException for a malformed processing-instruction
# test.
def NodeTest path, parsed
  case path
  when /^\*/
    path = $'
    parsed << :any
  when NODE_TYPE
    type = $1
    path = $'
    parsed << type.tr('-', '_').intern
  when PI
    path = $'
    literal = nil
    if path =~ /^\s*\)/
      # No argument: consume the closing parenthesis so the caller does
      # not see a stray ")" in the remainder.
      path = $'
    else
      unless path =~ LITERAL
        raise ParseException.new("Missing literal in processing instruction")
      end
      # LITERAL captures single-quoted text in group 1 and double-quoted
      # text in group 2.
      literal = $1 || $2
      path = $'
      raise ParseException.new("Missing ')' after processing instruction") if path[0] != ?)
      path = path[1..-1]
    end
    parsed << :processing_instruction
    parsed << literal
  when NCNAMETEST
    prefix = $1
    path = $'
    parsed << :namespace
    parsed << prefix
  when QNAME
    prefix = $1
    name = $2
    path = $'
    prefix = "" unless prefix
    parsed << :qname
    parsed << prefix
    parsed << name
  end
  return path
end
#| RelationalExpr ('<' | '>' | '<=' | '>=') AdditiveExpr
#| AdditiveExpr
#
# Parses a chain of relational comparisons from +path+.  Each operator
# wraps the tree built so far as [ :lt | :gt | :lteq | :gteq, left, right ].
# The result replaces +parsed+ when +parsed+ is empty, otherwise it is
# appended as a single element.  Returns the unparsed remainder.
def RelationalExpr path, parsed
  tree = []
  remainder = AdditiveExpr( path, tree )
  unless remainder == path
    while (found = /^\s*([<>]=?)\s*/.match( remainder ))
      operator = found[1]
      name = operator[0, 1] == "<" ? "lt" : "gt"
      name += "eq" if operator[-1, 1] == "="
      tree = [ name.intern, tree, [] ]
      remainder = AdditiveExpr( found.post_match, tree.last )
    end
  end
  unless tree.empty?
    if parsed.empty?
      parsed.replace( tree )
    else
      parsed << tree
    end
  end
  remainder
end
#| LocationPath
#| FilterExpr ('/' | '//') RelativeLocationPath
#
# Parses a path expression from +path+, appending its tokens to +parsed+.
# A filter expression followed by "/" continues as a relative location
# path; otherwise anything that looks like the start of a location path is
# handed to LocationPath.  Returns the unparsed remainder.
def PathExpr path, parsed
  # Skip leading whitespace
  path =~ /^\s*/
  path = $'
  n = []
  rest = FilterExpr( path, n )
  if rest != path and rest and rest[0] == ?/
    # FilterExpr consumed something and a path step follows.  Fall through
    # to the concat below so the tokens collected in +n+ (filter AND path)
    # reach the caller instead of being dropped by an early return.
    rest = RelativeLocationPath( rest, n )
  elsif rest =~ /^[\/\.\@\[\w_*]/
    rest = LocationPath( rest, n )
  end
  parsed.concat( n )
  return rest
end
# Splits a leading bracketed group off +string+.  The first character is
# taken as the opening delimiter; the closing delimiter is ")" for "(" and
# "]" for anything else.  Nested groups are balanced.
#
#   get_group( '[foo]bar' ) -> ['bar', '[foo]']
#
# Returns [ remainder, group ], or nil when the group is never closed.
def get_group string
  opener = string[0,1]
  closer = opener == "(" ? ")" : "]"
  depth = 0
  pos = 0
  loop do
    char = string[pos,1]
    if char == opener
      depth += 1
    elsif char == closer
      depth -= 1
    end
    pos += 1
    break unless depth > 0 && pos < string.length
  end
  return nil unless depth == 0
  [ string[pos..-1], string[0...pos] ]
end
# Evaluates +path+ against +element+ and returns the matching nodes.
#
# element::    the context node
# path::       the path expression; must not be nil
# namespaces:: prefix => URI mapping, installed as
#              Functions::namespace_context before matching
#
# Raises a RuntimeError when +path+ is nil.
def QuickPath::match element, path, namespaces=EMPTY_HASH
  raise "nil is not a valid xpath" unless path
  Functions::namespace_context = namespaces
  case path
  when /^\/([^\/]|$)/u
    # match on root
    path = path[1..-1]
    return [element.root.parent] if path == ''
    filter([element.root], path)
  when /^[-\w]*::/u
    # explicit axis: evaluated against the context node itself
    filter([element], path)
  when /^\*/u
    filter(element.to_a, path)
  when /^[\[!\w:]/u
    # match on child.  The "[" is escaped: a bare "[" inside a character
    # class opens a nested class in current Ruby regexps.
    filter(element.to_a, path)
  else
    filter([element], path)
  end
end
or path == '' or elements.size == 0 + case path + when /^\/\//u # Descendant + return axe( elements, "descendant-or-self", $' ) + when /^\/?\b(\w[-\w]*)\b::/u # Axe + axe_name = $1 + rest = $' + return axe( elements, $1, $' ) + when /^\/(?=\b([:!\w][-\.\w]*:)?[-!\*\.\w]*\b([^:(]|$)|\*)/u # Child + rest = $' + results = [] + elements.each do |element| + results |= filter( element.to_a, rest ) + end + return results + when /^\/?(\w[-\w]*)\(/u # / Function + return function( elements, $1, $' ) + when Namespace::NAMESPLIT # Element name + name = $2 + ns = $1 + rest = $' + elements.delete_if do |element| + !(element.kind_of? Element and + (element.expanded_name == name or + (element.name == name and + element.namespace == Functions.namespace_context[ns]))) + end + return filter( elements, rest ) + when /^\/\[/u + matches = [] + elements.each do |element| + matches |= predicate( element.to_a, path[1..-1] ) if element.kind_of? Element + end + return matches + when /^\[/u # Predicate + return predicate( elements, path ) + when /^\/?\.\.\./u # Ancestor + return axe( elements, "ancestor", $' ) + when /^\/?\.\./u # Parent + return filter( elements.collect{|e|e.parent}, $' ) + when /^\/?\./u # Self + return filter( elements, $' ) + when /^\*/u # Any + results = [] + elements.each do |element| + results |= filter( [element], $' ) if element.kind_of? Element + #if element.kind_of? Element + # children = element.to_a + # children.delete_if { |child| !child.kind_of?(Element) } + # results |= filter( children, $' ) + #end + end + return results + end + return [] + end + + def QuickPath::axe( elements, axe_name, rest ) + matches = [] + matches = filter( elements.dup, rest ) if axe_name =~ /-or-self$/u + case axe_name + when /^descendant/u + elements.each do |element| + matches |= filter( element.to_a, "descendant-or-self::#{rest}" ) if element.kind_of? 
# Filters +elements+ with the predicate at the front of +path+, then
# applies the remainder of the path to the survivors.
#
# elements:: the node-set to filter
# path::     a string starting with "[", e.g. "[@id='x']/rest"
#
# The predicate text is rewritten into a Ruby expression and evaluated once
# per element, with Functions.node set to the element and Functions.pair
# set to [ position, size ].  An element is kept when the result is true,
# is a String, or is an Integer equal to the element's 1-based position
# (so para[3] behaves like para[position()=3]).
#
# Raises a RuntimeError when the opening "[" is never closed.
def QuickPath::predicate( elements, path )
  # Find the "]" matching the leading "[", allowing nested brackets.
  ind = 1
  bcount = 1
  while bcount > 0
    # Without this guard an unbalanced "[" would loop forever.
    raise "unbalanced '[' in predicate \"#{path}\"" if ind >= path.length
    bcount += 1 if path[ind] == ?[
    bcount -= 1 if path[ind] == ?]
    ind += 1
  end
  ind -= 1
  predicate = path[1..ind-1]
  rest = path[ind+1..-1]

  # have to change 'a [=<>] b [=<>] c' into 'a [=<>] b and b [=<>] c'
  predicate.gsub!( /([^\s(and)(or)<>=]+)\s*([<>=])\s*([^\s(and)(or)<>=]+)\s*([<>=])\s*([^\s(and)(or)<>=]+)/u ) {
    "#$1 #$2 #$3 and #$3 #$4 #$5"
  }
  # Let's do some Ruby trickery to avoid some work:
  predicate.gsub!( /&/u, "&&" )
  predicate.gsub!( /=/u, "==" )
  predicate.gsub!( /@(\w[-\w.]*)/u ) {
    "attribute(\"#$1\")"
  }
  predicate.gsub!( /\bmod\b/u, "%" )
  predicate.gsub!( /\b(\w[-\w.]*\()/u ) {
    fname = $1
    fname.gsub( /-/u, "_" )
  }

  Functions.pair = [ 0, elements.size ]
  results = []
  elements.each do |element|
    Functions.pair[0] += 1
    Functions.node = element
    # NOTE(review): the rewritten predicate is executed as Ruby code, so
    # path expressions from untrusted sources must never reach this method.
    res = eval( predicate )
    case res
    when true
      results << element
    when Integer
      # Integer rather than Fixnum: the Fixnum constant no longer exists
      # in current Ruby.
      results << element if Functions.pair[0] == res
    when String
      results << element
    end
  end
  return filter( results, rest )
end
# Applies the function +fname+ to every node in +elements+ and keeps the
# nodes for which it answers positively.
#
# elements:: the node-set to test
# fname::    name of the method to call on Functions
# rest::     the path text following "fname(", from which the arguments
#            are read by parse_args
#
# For each element Functions.node is set to the element and Functions.pair
# to [ position, size ].  The element is kept when the call returns true,
# or an Integer equal to the element's 1-based position.
def QuickPath::function( elements, fname, rest )
  args = parse_args( elements, rest )
  Functions.pair = [0, elements.size]
  results = []
  elements.each do |element|
    Functions.pair[0] += 1
    Functions.node = element
    # NOTE(review): fname comes straight from the path expression; only
    # trusted paths should be evaluated.
    res = Functions.send( fname, *args )
    case res
    when true
      results << element
    when Integer
      # Integer rather than Fixnum: the Fixnum constant no longer exists
      # in current Ruby.
      results << element if Functions.pair[0] == res
    end
  end
  return results
end
module REXML
  # Copyright notice.  Double quotes are required for the year to be
  # interpolated; in single quotes the text "#{Time.now.year}" would be
  # stored literally.
  Copyright = "Copyright #{Time.now.year} Sean Russell "
  # Placeholders substituted by the build (Ant) at release time.
  Date = "@ANT_DATE@"
  Version = "@ANT_VERSION@"
end
# Generates Source-s.  USE THIS CLASS.
class SourceFactory
  # Generates a Source object
  # @param arg Either a String, an IO, or an IO-like object (anything
  #   that responds to +readline+ and +eof?+, such as a StringIO)
  # @return a Source for a String, an IOSource for an IO or IO-like
  #   object, or nil if a bad argument was given
  def SourceFactory::create_from arg#, slurp=true
    if arg.kind_of? String
      Source.new(arg)
    elsif arg.kind_of?(IO) or
          (arg.respond_to?(:readline) and arg.respond_to?(:eof?))
      # Duck-typed check so in-memory streams work as well as real IO.
      IOSource.new(arg)
    end
  end
end
# Runs String#scan with +pattern+ over the current buffer.
# @param pattern the Regexp to scan for
# @param consume if true and something matched, the buffer is replaced by
#   the text following the last match
# @return the array produced by String#scan, or nil if there is no buffer
def scan pattern, consume=false
  unless @buffer.nil?
    found = @buffer.scan( pattern )
    if consume && found.size > 0
      @buffer = Regexp.last_match.post_match
    end
    return found
  end
  nil
end
# Matches +pattern+ against the buffer, reading more data from the wrapped
# IO (one @line_break-terminated chunk at a time) until the pattern matches
# or the stream is exhausted.
# @param pattern the Regexp to match
# @param consume if true and the pattern matched, the buffer is replaced
#   by the text following the match
# @return the MatchData, or nil if the pattern never matched
def match pattern, consume=false
  rv = pattern.match(@buffer)
  @buffer = $' if consume and rv
  while !rv and @source
    begin
      # Use the same encoding-aware separator as scan(), rather than a
      # hard-coded '>'.
      str = @source.readline(@line_break)
      str = decode(str) if @to_utf and str
      @buffer << str
      rv = pattern.match(@buffer)
      @buffer = $' if consume and rv
    rescue
      # Best effort: any read failure (normally end of file) marks the
      # stream as exhausted.
      @source = nil
    end
  end
  # Object#taint no longer exists in current Ruby; only call it where it
  # is available.
  rv.taint if rv.respond_to?( :taint )
  rv
end
# Called when a processing instruction is encountered.  EG:
# <?xsl sheet='foo'?>
# @p name the instruction name; in the example, "xsl"
# @p instruction the rest of the instruction.  In the example,
# "sheet='foo'"
# This default implementation does nothing; listeners override it.
def instruction name, instruction
end
# Called when <![CDATA[ ... ]]> is encountered in a document.
# @p content "..."
# This default implementation does nothing; listeners override it.
def cdata content
end
EG, nil + def xmldecl version, encoding, standalone + end + end +end diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb new file mode 100644 index 0000000000..906f4d41fc --- /dev/null +++ b/lib/rexml/text.rb @@ -0,0 +1,279 @@ +require 'rexml/entity' + +module REXML + # Represents text nodes in an XML document + class Text < Child + include Comparable + # The order in which the substitutions occur + SPECIALS = [ /&(?!#?[\w-]+;)/u, //u, /"/u, /'/u, /\r/u ] + SUBSTITUTES = ['&', '<', '>', '"', ''', ' '] + # Characters which are substituted in written strings + SLAICEPS = [ '<', '>', '"', "'", '&' ] + SETUTITSBUS = [ /</u, />/u, /"/u, /'/u, /&/u ] + + # If +raw+ is true, then REXML leaves the value alone + attr_accessor :raw + + ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um + NUMERICENTITY = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + + # Constructor + # +arg+ if a String, the content is set to the String. If a Text, + # the object is shallowly cloned. + # + # +respect_whitespace+ (boolean, false) if true, whitespace is + # respected + # + # +parent+ (nil) if this is a Parent object, the parent + # will be set to this. + # + # +raw+ (nil) This argument can be given three values. + # If true, then the value of used to construct this object is expected to + # contain no unescaped XML markup, and REXML will not change the text. If + # this value is false, the string may contain any characters, and REXML will + # escape any and all defined entities whose values are contained in the + # text. If this value is nil (the default), then the raw value of the + # parent will be used as the raw value for this node. If there is no raw + # value for the parent, and no value is supplied, the default is false. 
#   Text.new( "<&", false, nil, false )        #-> "&lt;&amp;"
#   Text.new( "<&", false, nil, true )         #-> raises Exception
#   Text.new( "&lt;&amp;", false, nil, true )  #-> "&lt;&amp;"
#   # Assume that the entity "s" is defined to be "sean"
#   # and that the entity "r" is defined to be "russell"
#   Text.new( "sean russell" )                   #-> "&s; &r;"
#   Text.new( "sean russell", false, nil, true ) #-> "sean russell"
#
# +entity_filter+ (nil) This can be an array of entities to match in the
# supplied text.  This argument is only useful if +raw+ is set to false.
#   Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell"
#   Text.new( "sean russell", false, nil, true, ["s"] )  #-> "sean russell"
# In the last example, the +entity_filter+ argument is ignored.
#
# +illegal+ INTERNAL USE ONLY.  The pattern a raw string is checked against;
# a match raises.
#
# Raises Exception if +arg+ is neither a String nor a Text, or if the node is
# raw and its content matches +illegal+.
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
               entity_filter=nil, illegal=ILLEGAL )
  @raw = false

  if parent
    super( parent )
    @raw = parent.raw
  else
    @parent = nil
  end

  # An explicit +raw+ argument wins over the value inherited from the parent.
  @raw = raw unless raw.nil?
  @entity_filter = entity_filter
  @normalized = @unnormalized = nil

  if arg.kind_of? String
    # dup (not clone) so that a frozen argument still yields a mutable buffer.
    @string = arg.dup
    @string.squeeze!(" \n\t") unless respect_whitespace
  elsif arg.kind_of? Text
    # to_s can return the source node's own buffer or its cached escaped
    # form; copy it so the in-place edits below (and later appends) never
    # leak back into the node being copied.
    @string = arg.to_s.dup
    @raw = arg.raw
  else
    raise Exception.new( "Illegal argument of type #{arg.class} for Text constructor (#{arg})" )
  end

  # Normalize line endings (CRLF or lone CR) to a single newline.
  @string.gsub!( /\r\n?/, "\n" )

  # Raw text is stored as given, so it must already be free of bare markup.
  if @raw and @string =~ illegal
    raise Exception.new(
      "Illegal character '#{$1}' in raw string \"#{@string}\""
    )
  end
end

# The node type symbol for text nodes.
def node_type
  :text
end

# True when this node holds no characters at all.
def empty?
  @string.empty?
end

# Returns a new Text built from this one (see the Text branch of the
# constructor).
def clone
  Text.new(self)
end


# Appends text to this text node.  The text is appended in the +raw+ mode
# of this text node.
def <<( to_append )
  # The cached escaped/unescaped renderings describe the old content; drop
  # them so that to_s and value are recomputed after the append.
  @normalized = @unnormalized = nil
  # Line endings of the appended text are normalized like the constructor's.
  @string << to_append.gsub( /\r\n?/, "\n" )
end


# +other+ a String or a Text
# +returns+ the result of (to_s <=> other.to_s)
def <=>( other )
  to_s() <=> other.to_s
end

REFERENCE = /#{Entity::REFERENCE}/
# Returns the string value of this text node.  For a raw node this is the
# stored string, unchanged.  Otherwise the string is escaped, meaning that it
# is a valid XML text node string, and all entities that can be escaped have
# been inserted.  This method respects the entity filter set in the
# constructor.  The escaped form is computed once and cached.
#
#   # Assume that the entity "s" is defined to be "sean", and that the
#   # entity "r" is defined to be "russell"
#   t = Text.new( "< & sean russell", false, nil, false, ['s'] )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   t = Text.new( "< & &s; russell", false, nil, false )
#   t.to_s   #-> "&lt; &amp; &s; russell"
#   u = Text.new( "sean russell", false, nil, true )
#   u.to_s   #-> "sean russell"
def to_s
  return @string if @raw
  return @normalized if @normalized

  # Use the owning document's doctype, when there is one, so that entities
  # declared there take part in the escaping.
  doctype = nil
  if @parent
    doc = @parent.document
    doctype = doc.doctype if doc
  end

  @normalized = Text::normalize( @string, doctype, @entity_filter )
end

# Returns the string value of this text.  This is the text without
# entities, as it might be used programmatically, or printed to the
# console.  This ignores the 'raw' attribute setting, and any
# entity_filter.
+ # + # # Assume that the entity "s" is defined to be "sean", and that the + # # entity "r" is defined to be "russell" + # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) + # t.string #-> "< & sean russell" + # t = Text.new( "< & &s; russell", false, nil, false ) + # t.string #-> "< & sean russell" + # u = Text.new( "sean russell", false, nil, true ) + # u.string #-> "sean russell" + def value + @unnormalized if @unnormalized + doctype = nil + if @parent + doc = @parent.document + doctype = doc.doctype if doc + end + @unnormalized = Text::unnormalize( @string, doctype ) + end + + def write( writer, indent=-1, transitive=false, ie_hack=false ) + writer << to_s() + end + + # Writes out text, substituting special characters beforehand. + # +out+ A String, IO, or any other object supporting <<( String ) + # +input+ the text to substitute and the write out + # + # z=utf8.unpack("U*") + # ascOut="" + # z.each{|r| + # if r < 0x100 + # ascOut.concat(r.chr) + # else + # ascOut.concat(sprintf("&#x%x;", r)) + # end + # } + # puts ascOut + def write_with_substitution out, input + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( SPECIALS[0], SUBSTITUTES[0] ) + copy.gsub!( SPECIALS[1], SUBSTITUTES[1] ) + copy.gsub!( SPECIALS[2], SUBSTITUTES[2] ) + copy.gsub!( SPECIALS[3], SUBSTITUTES[3] ) + copy.gsub!( SPECIALS[4], SUBSTITUTES[4] ) + copy.gsub!( SPECIALS[5], SUBSTITUTES[5] ) + out << copy + end + + # Reads text, substituting entities + def Text::read_with_substitution( input, illegal=nil ) + copy = input.clone + + if copy =~ illegal + raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" ) + end if illegal + + copy.gsub!( /\r\n?/, "\n" ) + if copy.include? 
?& + copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] ) + copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] ) + copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] ) + copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] ) + copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] ) + copy.gsub!( /�*((?:\d+)|(?:x[a-f0-9]+));/ ) {|m| + m=$1 + #m='0' if m=='' + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + end + copy + end + + EREFERENCE = /&(?!#{Entity::NAME};)/ + # Escapes all possible entities + def Text::normalize( input, doctype=nil, entity_filter=nil ) + copy = input.clone + # Doing it like this rather than in a loop improves the speed + if doctype + copy.gsub!( EREFERENCE, '&' ) + doctype.entities.each_value do |entity| + copy.gsub!( entity.value, + "&#{entity.name};" ) if entity.value and + not( entity_filter and entity_filter.include?(entity) ) + end + else + copy.gsub!( EREFERENCE, '&' ) + DocType::DEFAULT_ENTITIES.each_value do |entity| + copy.gsub!(entity.value, "&#{entity.name};" ) + end + end + copy + end + + # Unescapes all possible entities + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil ) + rv = string.clone + rv.gsub!( /\r\n?/, "\n" ) + matches = rv.scan REFERENCE + return rv if matches.size == 0 + rv.gsub!( NUMERICENTITY ) {|m| + m=$1 + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + matches.collect!{|x|x[0]}.compact! 
+ if matches.size > 0 + if doctype + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = doctype.entity( entity_reference ) + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value ) if entity_value + end + end + else + matches.each do |entity_reference| + unless filter and filter.include?(entity_reference) + entity_value = DocType::DEFAULT_ENTITIES[ entity_reference ] + re = /&#{entity_reference};/ + rv.gsub!( re, entity_value.value ) if entity_value + end + end + end + rv.gsub!( /&/, '&' ) + end + rv + end + end +end diff --git a/lib/rexml/xmldecl.rb b/lib/rexml/xmldecl.rb new file mode 100644 index 0000000000..6a6cc31a53 --- /dev/null +++ b/lib/rexml/xmldecl.rb @@ -0,0 +1,72 @@ +require 'rexml/encoding' +require 'rexml/source' + +module REXML + # NEEDS DOCUMENTATION + class XMLDecl < Child + include Encoding + + DEFAULT_VERSION = "1.0"; + DEFAULT_ENCODING = "UTF-8"; + DEFAULT_STANDALONE = "no"; + START = '<\?xml'; + STOP = '\?>'; + + attr_accessor :version, :standalone + + def initialize(version=DEFAULT_VERSION, encoding=nil, standalone=nil) + @encoding_set = !encoding.nil? + if version.kind_of? XMLDecl + super() + @version = version.version + self.encoding = version.encoding + @standalone = version.standalone + else + super() + @version = version + self.encoding = encoding + @standalone = standalone + end + @version = DEFAULT_VERSION if @version.nil? + end + + def clone + XMLDecl.new(self) + end + + def write writer, indent=-1, transitive=false, ie_hack=false + indent( writer, indent ) + writer << START.sub(/\\/u, '') + writer << " #{content}" + writer << STOP.sub(/\\/u, '') + end + + def ==( other ) + other.kind_of?(XMLDecl) and + other.version == @version and + other.encoding == self.encoding and + other.standalone == @standalone + end + + def xmldecl version, encoding, standalone + @version = version + @encoding_set = !encoding.nil? 
+ self.encoding = encoding + @standalone = standalone + end + + def node_type + :xmldecl + end + + alias :stand_alone? :standalone + + private + def content + rv = "version='#@version'" + rv << " encoding='#{encoding}'" if @encoding_set + rv << " standalone='#@standalone'" if @standalone + rv + end + end +end diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb new file mode 100644 index 0000000000..6bbe5b07d5 --- /dev/null +++ b/lib/rexml/xmltokens.rb @@ -0,0 +1,18 @@ +module REXML + # Defines a number of tokens used for parsing XML. Not for general + # consumption. + module XMLTokens + NCNAME_STR= '[\w:][\-\w\d.]*' + NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + + NAMECHAR = '[\-\w\d\.:]' + NAME = "([\\w:]#{NAMECHAR}*)" + NMTOKEN = "(?:#{NAMECHAR})+" + NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*" + REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)" + + #REFERENCE = "(?:#{ENTITYREF}|#{CHARREF})" + #ENTITYREF = "&#{NAME};" + #CHARREF = "&#\\d+;|&#x[0-9a-fA-F]+;" + end +end diff --git a/lib/rexml/xpath.rb b/lib/rexml/xpath.rb new file mode 100644 index 0000000000..c9c216fe27 --- /dev/null +++ b/lib/rexml/xpath.rb @@ -0,0 +1,62 @@ +require 'rexml/functions' +require 'rexml/xpath_parser' + +module REXML + # Wrapper class. Use this class to access the XPath functions. + class XPath + include Functions + EMPTY_HASH = {} + + # Finds and returns the first node that matches the supplied xpath. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, returns the first + # node matching '*'. + # namespaces:: + # If supplied, a Hash which defines a namespace mapping. + # + # XPath.first( node ) + # XPath.first( doc, "//b"} ) + # XPath.first( node, "a/x:b", { "x"=>"http://doofus" } ) + def XPath::first element, path=nil, namespaces={}, variables={} + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? 
Array + parser.parse(path, element)[0] + end + + # Itterates over nodes that match the given path, calling the supplied + # block with the match. + # element:: + # The context element + # path:: + # The xpath to search for. If not supplied or nil, defaults to '*' + # namespaces:: + # If supplied, a Hash which defines a namespace mapping + # + # XPath.each( node ) { |el| ... } + # XPath.each( node, '/*[@attr='v']' ) { |el| ... } + # XPath.each( node, 'ancestor::x' ) { |el| ... } + def XPath::each element, path=nil, namespaces={}, variables={}, &block + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path, element).each( &block ) + end + + # Returns an array of nodes matching a given XPath. + def XPath::match element, path=nil, namespaces={}, variables={} + parser = XPathParser.new + parser.namespaces = namespaces + parser.variables = variables + path = "*" unless path + element = [element] unless element.kind_of? Array + parser.parse(path,element) + end + end +end diff --git a/lib/rexml/xpath_parser.rb b/lib/rexml/xpath_parser.rb new file mode 100644 index 0000000000..215078b766 --- /dev/null +++ b/lib/rexml/xpath_parser.rb @@ -0,0 +1,530 @@ +require 'rexml/namespace' +require 'rexml/xmltokens' +require 'rexml/parsers/xpathparser' + +# Ignore this class. It adds a __ne__ method, because Ruby doesn't seem to +# understand object.send( "!=", foo ), whereas it *does* understand "<", "==", +# and all of the other comparison methods. Stupid, and annoying, and not at +# all POLS. +class Object + def __ne__(b) + self != b + end +end + +module REXML + # You don't want to use this class. Really. Use XPath, which is a wrapper + # for this class. Believe me. You don't want to poke around in here. + # There is strange, dark magic at work in this code. Beware. Go back! Go + # back while you still can! 
+ class XPathParser + include XMLTokens + LITERAL = /^'([^']*)'|^"([^"]*)"/u + + def initialize( ) + @parser = REXML::Parsers::XPathParser.new + @namespaces = {} + @variables = {} + end + + def namespaces=( namespaces={} ) + Functions::namespace_context = namespaces + @namespaces = namespaces + end + + def variables=( vars={} ) + Functions::variables = vars + @variables = vars + end + + def parse path, nodeset + path_stack = @parser.parse( path ) + #puts "PARSE: #{path} => #{path_stack.inspect}" + match( path_stack, nodeset ) + end + + def predicate path, nodeset + path_stack = @parser.predicate( path ) + return Predicate( path_stack, nodeset ) + end + + def []=( variable_name, value ) + @variables[ variable_name ] = value + end + + private + + def match( path_stack, nodeset ) + while ( path_stack.size > 0 and nodeset.size > 0 ) + #puts "PARSE: #{path_stack.inspect} '#{nodeset.collect{|n|n.type}.inspect}'" + nodeset = internal_parse( path_stack, nodeset ) + #puts "NODESET: #{nodeset.size}" + #puts "PATH_STACK: #{path_stack.inspect}" + end + nodeset + end + + def internal_parse path_stack, nodeset + return nodeset if nodeset.size == 0 or path_stack.size == 0 + #puts "INTERNAL_PARSE: #{path_stack.inspect}, #{nodeset.collect{|n| n.type}.inspect}" + case path_stack.shift + when :document + return [ nodeset[0].root.parent ] + + when :qname + prefix = path_stack.shift + name = path_stack.shift + #puts "QNAME #{prefix}#{prefix.size>0?':':''}#{name}" + n = nodeset.clone + ns = @namespaces[prefix] + ns = ns ? 
ns : '' + n.delete_if do |node| + # FIXME: This DOUBLES the time XPath searches take + ns = node.namespace( prefix ) if node.node_type == :element and ns == '' + #puts "NODE: '#{node.to_s}'; node.has_name?( #{name.inspect}, #{ns.inspect} ): #{ node.has_name?( name, ns )}; node.namespace() = #{node.namespace().inspect}; node.prefix = #{node.prefix().inspect}" if node.node_type == :element + !(node.node_type == :element and node.name == name and node.namespace == ns ) + end + return n + + when :any + n = nodeset.clone + n.delete_if { |node| node.node_type != :element } + return n + + when :self + # THIS SPACE LEFT INTENTIONALLY BLANK + + when :processing_instruction + target = path_stack.shift + n = nodeset.clone + n.delete_if do |node| + (node.node_type != :processing_instruction) or + ( !target.nil? and ( node.target != target ) ) + end + return n + + when :text + #puts ":TEXT" + n = nodeset.clone + n.delete_if do |node| + #puts "#{node} :: #{node.node_type}" + node.node_type != :text + end + return n + + when :comment + n = nodeset.clone + n.delete_if do |node| + node.node_type != :comment + end + return n + + when :node + return nodeset + #n = nodeset.clone + #n.delete_if do |node| + # !node.node? + #end + #return n + + # FIXME: I suspect the following XPath will fail: + # /a/*/*[1] + when :child + #puts "CHILD" + new_nodeset = [] + ps_clone = nil + for node in nodeset + #ps_clone = path_stack.clone + #new_nodeset += internal_parse( ps_clone, node.children ) if node.parent? + new_nodeset += node.children if node.parent? + end + #path_stack[0,(path_stack.size-ps_clone.size)] = [] + return new_nodeset + + when :literal + literal = path_stack.shift + if literal =~ /^\d+(\.\d+)?$/ + return ($1 ? 
literal.to_f : literal.to_i) + end + #puts "RETURNING '#{literal}'" + return literal + + when :attribute + #puts ":ATTRIBUTE" + new_nodeset = [] + case path_stack.shift + when :qname + prefix = path_stack.shift + name = path_stack.shift + for element in nodeset + if element.node_type == :element + #puts element.name + #puts "looking for attribute #{name} in '#{@namespaces[prefix]}'" + attr = element.attribute( name, @namespaces[prefix] ) + #puts ":ATTRIBUTE: attr => #{attr}" + new_nodeset << attr if attr + end + end + when :any + for element in nodeset + if element.node_type == :element + attr = element.attributes + end + end + end + #puts "RETURNING #{new_nodeset.collect{|n|n.to_s}.inspect}" + return new_nodeset + + when :parent + return internal_parse( path_stack, nodeset.collect{|n| n.parent}.compact ) + + when :ancestor + #puts "ANCESTOR" + new_nodeset = [] + for node in nodeset + while node.parent + node = node.parent + new_nodeset << node unless new_nodeset.include? node + end + end + #nodeset = new_nodeset.uniq + return new_nodeset + + when :ancestor_or_self + new_nodeset = [] + for node in nodeset + if node.node_type == :element + new_nodeset << node + while ( node.parent ) + node = node.parent + new_nodeset << node unless new_nodeset.includes? node + end + end + end + #nodeset = new_nodeset.uniq + return new_nodeset + + when :predicate + #puts "@"*80 + #puts "NODESET = #{nodeset.collect{|n|n.to_s}.inspect}" + predicate = path_stack.shift + new_nodeset = [] + Functions::size = nodeset.size + nodeset.size.times do |index| + node = nodeset[index] + Functions::node = node + Functions::index = index+1 + #puts "Node #{node} and index=#{index+1}" + result = Predicate( predicate, node ) + #puts "Predicate returned #{result} (#{result.type}) for #{node.type}" + if result.kind_of? Numeric + #puts "#{result} == #{index} => #{result == index}" + new_nodeset << node if result == (index+1) + elsif result.instance_of? 
Array + new_nodeset << node if result.size > 0 + else + new_nodeset << node if result + end + end + #puts "Nodeset after predicate #{predicate.inspect} has #{new_nodeset.size} nodes" + #puts "NODESET: #{new_nodeset.collect{|n|n.to_s}.inspect}" + return new_nodeset + + when :descendant_or_self + rv = descendant_or_self( path_stack, nodeset ) + path_stack.clear + return rv + + when :descendant + #puts ":DESCENDANT" + results = [] + for node in nodeset + results += internal_parse( path_stack.clone.unshift( :descendant_or_self ), + node.children ) if node.parent? + end + return results + + when :following_sibling + results = [] + for node in nodeset + all_siblings = node.parent.children + current_index = all_siblings.index( node ) + following_siblings = all_siblings[ current_index+1 .. -1 ] + results += internal_parse( path_stack.clone, following_siblings ) + end + return results + + when :preceding_sibling + results = [] + for node in nodeset + all_siblings = node.parent.children + current_index = all_siblings.index( node ) + preceding_siblings = all_siblings[ 0 .. current_index-1 ] + results += internal_parse( path_stack.clone, preceding_siblings ) + end + return results + + when :preceding + new_nodeset = [] + for node in nodeset + new_nodeset += preceding( node ) + end + return new_nodeset + + when :following + new_nodeset = [] + for node in nodeset + new_nodeset += following( node ) + end + return new_nodeset + + when :namespace + new_set = [] + for node in nodeset + new_nodeset << node.namespace if node.node_type == :element or node.node_type == :attribute + end + return new_nodeset + + when :variable + var_name = path_stack.shift + return @variables[ var_name ] + + end + nodeset + end + + ########################################################## + # The next two methods are BAD MOJO! + # This is my achilles heel. If anybody thinks of a better + # way of doing this, be my guest. This really sucks, but + # it took me three days to get it to work at all. 
+ # ######################################################## + + def descendant_or_self( path_stack, nodeset ) + rs = [] + d_o_s( path_stack, nodeset, rs ) + #puts "RS = #{rs.collect{|n|n.to_s}.inspect}" + rs.flatten.compact + end + + def d_o_s( p, ns, r ) + #puts r.collect{|n|n.to_s}.inspect + #puts ns.collect{|n|n.to_s}.inspect + ns.each_index do |i| + n = ns[i] + x = match( p.clone, [ n ] ) + #puts "Got a match on #{p.inspect} for #{ns.collect{|n|n.to_s+"("+n.type.to_s+")"}.inspect}" + d_o_s( p, n.children, x ) if n.parent? + r[i,0] = [x] if x.size > 0 + end + end + + def recurse( nodeset, &block ) + for node in nodeset + yield node + recurse( node, &block ) if node.node_type == :element + end + end + + + # Given a predicate, a node, and a context, evaluates to true or false. + def Predicate( predicate, node ) + predicate = predicate.clone + #puts "#"*20 + #puts "Predicate( #{predicate.inspect}, #{node.type} )" + results = [] + case (predicate[0]) + when :and, :or, :eq, :neq, :lt, :lteq, :gt, :gteq + eq = predicate.shift + left = Predicate( predicate.shift, node ) + right = Predicate( predicate.shift, node ) + return equality_relational_compare( left, eq, right ) + + when :div, :mod, :mult, :plus, :minus, :union + op = predicate.shift + left = Predicate( predicate.shift, node ) + right = Predicate( predicate.shift, node ) + left = Functions::number( left ) + right = Functions::number( right ) + case op + when :div + return left.to_f / right.to_f + when :mod + return left % right + when :mult + return left * right + when :plus + return left + right + when :minus + return left - right + when :union + return (left | right) + end + + when :neg + predicate.shift + operand = Functions::number(Predicate( predicate, node )) + return -operand + + when :not + predicate.shift + return !Predicate( predicate.shift, node ) + + when :function + predicate.shift + func_name = predicate.shift.tr('-', '_') + arguments = predicate.shift + #puts "\nFUNCTION: #{func_name}" + #puts 
"ARGUMENTS: #{arguments.inspect} #{node.to_s}" + args = arguments.collect { |arg| Predicate( arg, node ) } + #puts "FUNCTION: #{func_name}( #{args.collect{|n|n.to_s}.inspect} )" + result = Functions.send( func_name, *args ) + #puts "RESULTS: #{result.inspect}" + return result + + else + return match( predicate, [ node ] ) + + end + end + + # Builds a nodeset of all of the following nodes of the supplied node, + # in document order + def following( node ) + all_siblings = node.parent.children + current_index = all_siblings.index( node ) + following_siblings = all_siblings[ current_index+1 .. -1 ] + following = [] + recurse( following_siblings ) { |node| following << node } + following.shift + #puts "following is returning #{puta following}" + following + end + + # Builds a nodeset of all of the preceding nodes of the supplied node, + # in reverse document order + def preceding( node ) + all_siblings = node.parent.children + current_index = all_siblings.index( node ) + preceding_siblings = all_siblings[ 0 .. current_index-1 ] + + preceding_siblings.reverse! + preceding = [] + recurse( preceding_siblings ) { |node| preceding << node } + preceding.reverse + end + + def equality_relational_compare( set1, op, set2 ) + #puts "EQ_REL_COMP: #{set1.to_s}, #{op}, #{set2.to_s}" + if set1.kind_of? Array and set2.kind_of? Array + if set1.size == 1 and set2.size == 1 + set1 = set1[0] + set2 = set2[0] + else + set1.each do |i1| + i1 = i1.to_s + set2.each do |i2| + i2 = i2.to_s + return true if compare( i1, op, i2 ) + end + end + return false + end + end + #puts "COMPARING VALUES" + # If one is nodeset and other is number, compare number to each item + # in nodeset s.t. number op number(string(item)) + # If one is nodeset and other is string, compare string to each item + # in nodeset s.t. string op string(item) + # If one is nodeset and other is boolean, compare boolean to each item + # in nodeset s.t. boolean op boolean(item) + if set1.kind_of? Array or set2.kind_of? 
Array + #puts "ISA ARRAY" + if set1.kind_of? Array + a = set1 + b = set2.to_s + else + a = set2 + b = set1.to_s + end + + case b + when 'true', 'false' + b = Functions::boolean( b ) + for v in a + v = Functions::boolean(v) + return true if compare( v, op, b ) + end + when /^\d+(\.\d+)?$/ + b = Functions::number( b ) + for v in a + v = Functions::number(v) + return true if compare( v, op, b ) + end + else + b = Functions::string( b ) + for v in a + v = Functions::string(v) + return true if compare( v, op, b ) + end + end + else + # If neither is nodeset, + # If op is = or != + # If either boolean, convert to boolean + # If either number, convert to number + # Else, convert to string + # Else + # Convert both to numbers and compare + s1 = set1.to_s + s2 = set2.to_s + #puts "EQ_REL_COMP: #{set1}=>#{s1}, #{set2}=>#{s2}" + if s1 == 'true' or s1 == 'false' or s2 == 'true' or s2 == 'false' + #puts "Functions::boolean(#{set1})=>#{Functions::boolean(set1)}" + #puts "Functions::boolean(#{set2})=>#{Functions::boolean(set2)}" + set1 = Functions::boolean( set1 ) + set2 = Functions::boolean( set2 ) + else + if op == :eq or op == :neq + if s1 =~ /^\d+(\.\d+)?$/ or s2 =~ /^\d+(\.\d+)?$/ + set1 = Functions::number( s1 ) + set2 = Functions::number( s2 ) + else + set1 = Functions::string( set1 ) + set2 = Functions::string( set2 ) + end + else + set1 = Functions::number( set1 ) + set2 = Functions::number( set2 ) + end + end + #puts "EQ_REL_COMP: #{set1} #{op} #{set2}" + return compare( set1, op, set2 ) + end + return false + end + + def compare a, op, b + case op + when :eq + a == b + when :neq + a != b + when :lt + a < b + when :lteq + a <= b + when :gt + a > b + when :gteq + a >= b + when :and + a and b + when :or + a or b + else + false + end + end + end +end -- cgit v1.2.3