1 files changed, 0 insertions, 530 deletions
diff --git a/trunk/lib/rexml/parsers/baseparser.rb b/trunk/lib/rexml/parsers/baseparser.rb
deleted file mode 100644
index 162d029a62..0000000000
--- a/trunk/lib/rexml/parsers/baseparser.rb
+++ /dev/null
@@ -1,530 +0,0 @@
-require 'rexml/parseexception'
-require 'rexml/undefinednamespaceexception'
-require 'rexml/source'
-require 'set'
-
-module REXML
-  module Parsers
-    # = Using the Pull Parser
-    # <em>This API is experimental, and subject to change.</em>
-    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
-    #  while parser.has_next?
-    #    res = parser.next
-    #    puts res[1]['att'] if res.start_tag? and res[0] == 'b'
-    #  end
-    # See the PullEvent class for information on the content of the results.
-    # The data is identical to the arguments passed for the various events to
-    # the StreamListener API.
-    #
-    # Notice that:
-    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
-    #  while parser.has_next?
-    #    res = parser.next
-    #    raise res[1] if res.error?
-    #  end
-    #
-    # Nat Price gave me some good ideas for the API.
-    class BaseParser
-      if String.method_defined? :encode
-        # Oniguruma / POSIX [understands unicode]
-        LETTER = '[[:alpha:]]'
-        DIGIT = '[[:digit:]]'
-      else
-        # Ruby < 1.9 [doesn't understand unicode]
-        LETTER = 'a-zA-Z'
-        DIGIT = '\d'
-      end
-
-      COMBININGCHAR = '' # TODO
-      EXTENDER = ''      # TODO
-
-      NCNAME_STR= "[#{LETTER}_:][-#{LETTER}#{DIGIT}._:#{COMBININGCHAR}#{EXTENDER}]*"
-      NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
-      UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
-
-      NAMECHAR = '[\-\w\d\.:]'
-      NAME = "([\\w:]#{NAMECHAR}*)"
-      NMTOKEN = "(?:#{NAMECHAR})+"
-      NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
-      REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
-      REFERENCE_RE = /#{REFERENCE}/
-
-      DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
-      DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
-      ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
-      COMMENT_START = /\A<!--/u
-      COMMENT_PATTERN = /<!--(.*?)-->/um
-      CDATA_START = /\A<!\[CDATA\[/u
-      CDATA_END = /^\s*\]\s*>/um
-      CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
-      XMLDECL_START = /\A<\?xml\s/u;
-      XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
-      INSTRUCTION_START = /\A<\?/u
-      INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
-      TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
-      CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
-
-      VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
-      ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
-      STANDALONE = /\bstandalone\s*=\s["'](.*?)['"]/um
-
-      ENTITY_START = /^\s*<!ENTITY/
-      IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
-      ELEMENTDECL_START = /^\s*<!ELEMENT/um
-      ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
-      SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
-      ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
-      NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
-      ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
-      ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
-      ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
-      DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
-      ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
-      ATTDEF_RE = /#{ATTDEF}/
-      ATTLISTDECL_START = /^\s*<!ATTLIST/um
-      ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
-      NOTATIONDECL_START = /^\s*<!NOTATION/um
-      PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
-      SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
-
-      TEXT_PATTERN = /\A([^<]*)/um
-
-      # Entity constants
-      PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
-      SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
-      PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
-      EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
-      NDATADECL = "\\s+NDATA\\s+#{NAME}"
-      PEREFERENCE = "%#{NAME};"
-      ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
-      PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
-      ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
-      PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
-      GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
-
-      EREFERENCE = /&(?!#{NAME};)/
-
-      DEFAULT_ENTITIES = { 
-        'gt' => [/&gt;/, '&gt;', '>', />/], 
-        'lt' => [/&lt;/, '&lt;', '<', /</], 
-        'quot' => [/&quot;/, '&quot;', '"', /"/], 
-        "apos" => [/&apos;/, "&apos;", "'", /'/] 
-      }
-
-
-      ######################################################################
-      # These are patterns to identify common markup errors, to make the
-      # error messages more informative.
-      ######################################################################
-      MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
-
-      def initialize( source )
-        self.stream = source
-      end
-
-      def add_listener( listener )
-        if !defined?(@listeners) or !@listeners
-          @listeners = []
-          instance_eval <<-EOL
-            alias :_old_pull :pull
-            def pull
-              event = _old_pull
-              @listeners.each do |listener|
-                listener.receive event
-              end
-              event
-            end
-          EOL
-        end
-        @listeners << listener
-      end
-
-      attr_reader :source
-
-      def stream=( source )
-        @source = SourceFactory.create_from( source )
-        @closed = nil
-        @document_status = nil
-        @tags = []
-        @stack = []
-        @entities = []
-        @nsstack = []
-      end
-
-      def position
-        if @source.respond_to? :position
-          @source.position
-        else
-          # FIXME
-          0
-        end
-      end
-
-      # Returns true if there are no more events
-      def empty?
-        return (@source.empty? and @stack.empty?)
-      end
-
-      # Returns true if there are more events.  Synonymous with !empty?
-      def has_next?
-        return !(@source.empty? and @stack.empty?)
-      end
-
-      # Push an event back on the head of the stream.  This method
-      # has (theoretically) infinite depth.
-      def unshift token
-        @stack.unshift(token)
-      end
-
-      # Peek at the +depth+ event in the stack.  The first element on the stack
-      # is at depth 0.  If +depth+ is -1, will parse to the end of the input
-      # stream and return the last event, which is always :end_document.
-      # Be aware that this causes the stream to be parsed up to the +depth+ 
-      # event, so you can effectively pre-parse the entire document (pull the 
-      # entire thing into memory) using this method.  
-      def peek depth=0
-        raise %Q[Illegal argument "#{depth}"] if depth < -1
-        temp = []
-        if depth == -1
-          temp.push(pull()) until empty?
-        else
-          while @stack.size+temp.size < depth+1
-            temp.push(pull())
-          end
-        end
-        @stack += temp if temp.size > 0
-        @stack[depth]
-      end
-
-      # Returns the next event.  This is a +PullEvent+ object.
-      def pull
-        if @closed
-          x, @closed = @closed, nil
-          return [ :end_element, x ]
-        end
-        return [ :end_document ] if empty?
-        return @stack.shift if @stack.size > 0
-        #STDERR.puts @source.encoding
-        @source.read if @source.buffer.size<2
-        #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
-        if @document_status == nil
-          #@source.consume( /^\s*/um )
-          word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
-          word = word[1] unless word.nil?
-          #STDERR.puts "WORD = #{word.inspect}"
-          case word
-          when COMMENT_START
-            return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
-          when XMLDECL_START
-            #STDERR.puts "XMLDECL"
-            results = @source.match( XMLDECL_PATTERN, true )[1]
-            version = VERSION.match( results )
-            version = version[1] unless version.nil?
-            encoding = ENCODING.match(results)
-            encoding = encoding[1] unless encoding.nil?
-            @source.encoding = encoding
-            standalone = STANDALONE.match(results)
-            standalone = standalone[1] unless standalone.nil?
-            return [ :xmldecl, version, encoding, standalone ]
-          when INSTRUCTION_START
-            return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
-          when DOCTYPE_START
-            md = @source.match( DOCTYPE_PATTERN, true )
-            @nsstack.unshift(curr_ns=Set.new)
-            identity = md[1]
-            close = md[2]
-            identity =~ IDENTITY
-            name = $1
-            raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
-            pub_sys = $2.nil? ? nil : $2.strip
-            long_name = $4.nil? ? nil : $4.strip
-            uri = $6.nil? ? nil : $6.strip
-            args = [ :start_doctype, name, pub_sys, long_name, uri ]
-            if close == ">"
-              @document_status = :after_doctype
-              @source.read if @source.buffer.size<2
-              md = @source.match(/^\s*/um, true)
-              @stack << [ :end_doctype ]
-            else
-              @document_status = :in_doctype
-            end
-            return args
-          when /^\s+/
-          else
-            @document_status = :after_doctype
-            @source.read if @source.buffer.size<2
-            md = @source.match(/\s*/um, true)
-            if @source.encoding == "UTF-8"
-              if @source.buffer.respond_to? :force_encoding
-                @source.buffer.force_encoding(Encoding::UTF_8)
-              end
-            end
-          end
-        end
-        if @document_status == :in_doctype
-          md = @source.match(/\s*(.*?>)/um)
-          case md[1]
-          when SYSTEMENTITY 
-            match = @source.match( SYSTEMENTITY, true )[1]
-            return [ :externalentity, match ]
-
-          when ELEMENTDECL_START
-            return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
-
-          when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
-            match[0] = :entitydecl
-            ref = false
-            if match[1] == '%'
-              ref = true
-              match.delete_at 1
-            end
-            # Now we have to sort out what kind of entity reference this is
-            if match[2] == 'SYSTEM'
-              # External reference
-              match[3] = match[3][1..-2] # PUBID
-              match.delete_at(4) if match.size > 4 # Chop out NDATA decl
-              # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
-            elsif match[2] == 'PUBLIC'
-              # External reference
-              match[3] = match[3][1..-2] # PUBID
-              match[4] = match[4][1..-2] # HREF
-              # match is [ :entity, name, PUBLIC, pubid, href ]
-            else
-              match[2] = match[2][1..-2]
-              match.pop if match.size == 4
-              # match is [ :entity, name, value ]
-            end
-            match << '%' if ref
-            return match
-          when ATTLISTDECL_START
-            md = @source.match( ATTLISTDECL_PATTERN, true )
-            raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
-            element = md[1]
-            contents = md[0]
-
-            pairs = {}
-            values = md[0].scan( ATTDEF_RE )
-            values.each do |attdef|
-              unless attdef[3] == "#IMPLIED"
-                attdef.compact!
-                val = attdef[3]
-                val = attdef[4] if val == "#FIXED "
-                pairs[attdef[0]] = val
-                if attdef[0] =~ /^xmlns:(.*)/
-                  @nsstack[0] << $1
-                end
-              end
-            end
-            return [ :attlistdecl, element, pairs, contents ]
-          when NOTATIONDECL_START
-            md = nil
-            if @source.match( PUBLIC )
-              md = @source.match( PUBLIC, true )
-              vals = [md[1],md[2],md[4],md[6]]
-            elsif @source.match( SYSTEM )
-              md = @source.match( SYSTEM, true )
-              vals = [md[1],md[2],nil,md[4]]
-            else
-              raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
-            end
-            return [ :notationdecl, *vals ]
-          when CDATA_END
-            @document_status = :after_doctype
-            @source.match( CDATA_END, true )
-            return [ :end_doctype ]
-          end
-        end
-        begin
-          if @source.buffer[0] == ?<
-            if @source.buffer[1] == ?/
-              @nsstack.shift
-              last_tag = @tags.pop
-              #md = @source.match_to_consume( '>', CLOSE_MATCH)
-              md = @source.match( CLOSE_MATCH, true )
-              raise REXML::ParseException.new( "Missing end tag for "+
-                "'#{last_tag}' (got \"#{md[1]}\")", 
-                @source) unless last_tag == md[1]
-              return [ :end_element, last_tag ]
-            elsif @source.buffer[1] == ?!
-              md = @source.match(/\A(\s*[^>]*>)/um)
-              #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
-              raise REXML::ParseException.new("Malformed node", @source) unless md
-              if md[0][2] == ?-
-                md = @source.match( COMMENT_PATTERN, true )
-
-                case md[1]
-                when /--/, /-$/
-                  raise REXML::ParseException.new("Malformed comment", @source)
-                end
-
-                return [ :comment, md[1] ] if md
-              else
-                md = @source.match( CDATA_PATTERN, true )
-                return [ :cdata, md[1] ] if md
-              end
-              raise REXML::ParseException.new( "Declarations can only occur "+
-                "in the doctype declaration.", @source)
-            elsif @source.buffer[1] == ??
-              md = @source.match( INSTRUCTION_PATTERN, true )
-              return [ :processing_instruction, md[1], md[2] ] if md
-              raise REXML::ParseException.new( "Bad instruction declaration",
-                @source)
-            else
-              # Get the next tag
-              md = @source.match(TAG_MATCH, true)
-              unless md
-                # Check for missing attribute quotes
-                raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
-                raise REXML::ParseException.new("malformed XML: missing tag start", @source) 
-              end
-              attributes = {}
-              prefixes = Set.new
-              prefixes << md[2] if md[2]
-              @nsstack.unshift(curr_ns=Set.new)
-              if md[4].size > 0
-                attrs = md[4].scan( ATTRIBUTE_PATTERN )
-                raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
-                attrs.each { |a,b,c,d,e| 
-                  if b == "xmlns"
-                    if c == "xml"
-                      if d != "http://www.w3.org/XML/1998/namespace"
-                        msg = "The 'xml' prefix must not be bound to any other namespace "+
-                        "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
-                        raise REXML::ParseException.new( msg, @source, self )
-                      end
-                    elsif c == "xmlns"
-                      msg = "The 'xmlns' prefix must not be declared "+
-                      "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
-                      raise REXML::ParseException.new( msg, @source, self)
-                    end
-                    curr_ns << c
-                  elsif b
-                    prefixes << b unless b == "xml"
-                  end
-
-                  if attributes.has_key? a
-                    msg = "Duplicate attribute #{a.inspect}"
-                    raise REXML::ParseException.new( msg, @source, self)
-                  end
-
-                  attributes[a] = e 
-                }
-              end
-        
-              # Verify that all of the prefixes have been defined
-              for prefix in prefixes
-                unless @nsstack.find{|k| k.member?(prefix)}
-                  raise UndefinedNamespaceException.new(prefix,@source,self)
-                end
-              end
-
-              if md[6]
-                @closed = md[1]
-                @nsstack.shift
-              else
-                @tags.push( md[1] )
-              end
-              return [ :start_element, md[1], attributes ]
-            end
-          else
-            md = @source.match( TEXT_PATTERN, true )
-            if md[0].length == 0
-              @source.match( /(\s+)/, true )
-            end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
-          end
-        rescue REXML::UndefinedNamespaceException
-          raise
-        rescue REXML::ParseException
-          raise
-        rescue Exception, NameError => error
-          raise REXML::ParseException.new( "Exception parsing",
-            @source, self, (error ? error : $!) )
-        end
-        return [ :dummy ]
-      end
-
-      def entity( reference, entities )
-        value = nil
-        value = entities[ reference ] if entities
-        if not value
-          value = DEFAULT_ENTITIES[ reference ]
-          value = value[2] if value
-        end
-        unnormalize( value, entities ) if value
-      end
-
-      # Escapes all possible entities
-      def normalize( input, entities=nil, entity_filter=nil )
-        copy = input.clone
-        # Doing it like this rather than in a loop improves the speed
-        copy.gsub!( EREFERENCE, '&amp;' )
-        entities.each do |key, value|
-          copy.gsub!( value, "&#{key};" ) unless entity_filter and 
-                                      entity_filter.include?(entity)
-        end if entities
-        copy.gsub!( EREFERENCE, '&amp;' )
-        DEFAULT_ENTITIES.each do |key, value|
-          copy.gsub!( value[3], value[1] )
-        end
-        copy
-      end
-
-      # Unescapes all possible entities
-      def unnormalize( string, entities=nil, filter=nil )
-        rv = string.clone
-        rv.gsub!( /\r\n?/, "\n" )
-        matches = rv.scan( REFERENCE_RE )
-        return rv if matches.size == 0
-        rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {
-          m=$1
-          m = "0#{m}" if m[0] == ?x
-          [Integer(m)].pack('U*')
-        }
-        matches.collect!{|x|x[0]}.compact!
-        if matches.size > 0
-          matches.each do |entity_reference|
-            unless filter and filter.include?(entity_reference)
-              entity_value = entity( entity_reference, entities )
-              if entity_value
-                re = /&#{entity_reference};/
-                rv.gsub!( re, entity_value )
-              else
-                er = DEFAULT_ENTITIES[entity_reference]
-                rv.gsub!( er[0], er[2] ) if er
-              end
-            end
-          end
-          rv.gsub!( /&amp;/, '&' )
-        end
-        rv
-      end
-    end
-  end
-end
-
-=begin
-  case event[0]
-  when :start_element
-  when :text
-  when :end_element
-  when :processing_instruction
-  when :cdata
-  when :comment
-  when :xmldecl
-  when :start_doctype
-  when :end_doctype
-  when :externalentity
-  when :elementdecl
-  when :entity
-  when :attlistdecl
-  when :notationdecl
-  when :end_doctype
-  end
-=end