summaryrefslogtreecommitdiff
path: root/ruby_1_8_6/lib/rexml/parsers/baseparser.rb
diff options
context:
space:
mode:
Diffstat (limited to 'ruby_1_8_6/lib/rexml/parsers/baseparser.rb')
-rw-r--r--ruby_1_8_6/lib/rexml/parsers/baseparser.rb503
1 files changed, 0 insertions, 503 deletions
diff --git a/ruby_1_8_6/lib/rexml/parsers/baseparser.rb b/ruby_1_8_6/lib/rexml/parsers/baseparser.rb
deleted file mode 100644
index fc2354a67f..0000000000
--- a/ruby_1_8_6/lib/rexml/parsers/baseparser.rb
+++ /dev/null
@@ -1,503 +0,0 @@
-require 'rexml/parseexception'
-require 'rexml/undefinednamespaceexception'
-require 'rexml/source'
-require 'set'
-
-module REXML
- module Parsers
- # = Using the Pull Parser
- # <em>This API is experimental, and subject to change.</em>
- # parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
- # while parser.has_next?
- # res = parser.next
- # puts res[1]['att'] if res.start_tag? and res[0] == 'b'
- # end
- # See the PullEvent class for information on the content of the results.
- # The data is identical to the arguments passed for the various events to
- # the StreamListener API.
- #
- # Notice that:
- # parser = PullParser.new( "<a>BAD DOCUMENT" )
- # while parser.has_next?
- # res = parser.next
- # raise res[1] if res.error?
- # end
- #
- # Nat Price gave me some good ideas for the API.
- class BaseParser
- NCNAME_STR= '[\w:][\-\w\d.]*'
- NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
- UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
-
- NAMECHAR = '[\-\w\d\.:]'
- NAME = "([\\w:]#{NAMECHAR}*)"
- NMTOKEN = "(?:#{NAMECHAR})+"
- NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
- REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
- REFERENCE_RE = /#{REFERENCE}/
-
- DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
- DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um
- ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um
- COMMENT_START = /\A<!--/u
- COMMENT_PATTERN = /<!--(.*?)-->/um
- CDATA_START = /\A<!\[CDATA\[/u
- CDATA_END = /^\s*\]\s*>/um
- CDATA_PATTERN = /<!\[CDATA\[(.*?)\]\]>/um
- XMLDECL_START = /\A<\?xml\s/u;
- XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um
- INSTRUCTION_START = /\A<\?/u
- INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um
- TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um
- CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um
-
- VERSION = /\bversion\s*=\s*["'](.*?)['"]/um
- ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um
- STANDALONE = /\bstandalone\s*=\s["'](.*?)['"]/um
-
- ENTITY_START = /^\s*<!ENTITY/
- IDENTITY = /^([!\*\w\-]+)(\s+#{NCNAME_STR})?(\s+["'](.*?)['"])?(\s+['"](.*?)["'])?/u
- ELEMENTDECL_START = /^\s*<!ELEMENT/um
- ELEMENTDECL_PATTERN = /^\s*(<!ELEMENT.*?)>/um
- SYSTEMENTITY = /^\s*(%.*?;)\s*$/um
- ENUMERATION = "\\(\\s*#{NMTOKEN}(?:\\s*\\|\\s*#{NMTOKEN})*\\s*\\)"
- NOTATIONTYPE = "NOTATION\\s+\\(\\s*#{NAME}(?:\\s*\\|\\s*#{NAME})*\\s*\\)"
- ENUMERATEDTYPE = "(?:(?:#{NOTATIONTYPE})|(?:#{ENUMERATION}))"
- ATTTYPE = "(CDATA|ID|IDREF|IDREFS|ENTITY|ENTITIES|NMTOKEN|NMTOKENS|#{ENUMERATEDTYPE})"
- ATTVALUE = "(?:\"((?:[^<&\"]|#{REFERENCE})*)\")|(?:'((?:[^<&']|#{REFERENCE})*)')"
- DEFAULTDECL = "(#REQUIRED|#IMPLIED|(?:(#FIXED\\s+)?#{ATTVALUE}))"
- ATTDEF = "\\s+#{NAME}\\s+#{ATTTYPE}\\s+#{DEFAULTDECL}"
- ATTDEF_RE = /#{ATTDEF}/
- ATTLISTDECL_START = /^\s*<!ATTLIST/um
- ATTLISTDECL_PATTERN = /^\s*<!ATTLIST\s+#{NAME}(?:#{ATTDEF})*\s*>/um
- NOTATIONDECL_START = /^\s*<!NOTATION/um
- PUBLIC = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(PUBLIC)\s+(["'])(.*?)\3(?:\s+(["'])(.*?)\5)?\s*>/um
- SYSTEM = /^\s*<!NOTATION\s+(\w[\-\w]*)\s+(SYSTEM)\s+(["'])(.*?)\3\s*>/um
-
- TEXT_PATTERN = /\A([^<]*)/um
-
- # Entity constants
- PUBIDCHAR = "\x20\x0D\x0Aa-zA-Z0-9\\-()+,./:=?;!*@$_%#"
- SYSTEMLITERAL = %Q{((?:"[^"]*")|(?:'[^']*'))}
- PUBIDLITERAL = %Q{("[#{PUBIDCHAR}']*"|'[#{PUBIDCHAR}]*')}
- EXTERNALID = "(?:(?:(SYSTEM)\\s+#{SYSTEMLITERAL})|(?:(PUBLIC)\\s+#{PUBIDLITERAL}\\s+#{SYSTEMLITERAL}))"
- NDATADECL = "\\s+NDATA\\s+#{NAME}"
- PEREFERENCE = "%#{NAME};"
- ENTITYVALUE = %Q{((?:"(?:[^%&"]|#{PEREFERENCE}|#{REFERENCE})*")|(?:'([^%&']|#{PEREFERENCE}|#{REFERENCE})*'))}
- PEDEF = "(?:#{ENTITYVALUE}|#{EXTERNALID})"
- ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
- PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
- GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
-
- EREFERENCE = /&(?!#{NAME};)/
-
- DEFAULT_ENTITIES = {
- 'gt' => [/&gt;/, '&gt;', '>', />/],
- 'lt' => [/&lt;/, '&lt;', '<', /</],
- 'quot' => [/&quot;/, '&quot;', '"', /"/],
- "apos" => [/&apos;/, "&apos;", "'", /'/]
- }
-
-
- ######################################################################
- # These are patterns to identify common markup errors, to make the
- # error messages more informative.
- ######################################################################
- MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um
-
- def initialize( source )
- self.stream = source
- end
-
- def add_listener( listener )
- if !defined?(@listeners) or !@listeners
- @listeners = []
- instance_eval <<-EOL
- alias :_old_pull :pull
- def pull
- event = _old_pull
- @listeners.each do |listener|
- listener.receive event
- end
- event
- end
- EOL
- end
- @listeners << listener
- end
-
- attr_reader :source
-
- def stream=( source )
- @source = SourceFactory.create_from( source )
- @closed = nil
- @document_status = nil
- @tags = []
- @stack = []
- @entities = []
- @nsstack = []
- end
-
- def position
- if @source.respond_to? :position
- @source.position
- else
- # FIXME
- 0
- end
- end
-
- # Returns true if there are no more events
- def empty?
- return (@source.empty? and @stack.empty?)
- end
-
- # Returns true if there are more events. Synonymous with !empty?
- def has_next?
- return !(@source.empty? and @stack.empty?)
- end
-
- # Push an event back on the head of the stream. This method
- # has (theoretically) infinite depth.
- def unshift token
- @stack.unshift(token)
- end
-
- # Peek at the +depth+ event in the stack. The first element on the stack
- # is at depth 0. If +depth+ is -1, will parse to the end of the input
- # stream and return the last event, which is always :end_document.
- # Be aware that this causes the stream to be parsed up to the +depth+
- # event, so you can effectively pre-parse the entire document (pull the
- # entire thing into memory) using this method.
- def peek depth=0
- raise %Q[Illegal argument "#{depth}"] if depth < -1
- temp = []
- if depth == -1
- temp.push(pull()) until empty?
- else
- while @stack.size+temp.size < depth+1
- temp.push(pull())
- end
- end
- @stack += temp if temp.size > 0
- @stack[depth]
- end
-
- # Returns the next event. This is a +PullEvent+ object.
- def pull
- if @closed
- x, @closed = @closed, nil
- return [ :end_element, x ]
- end
- return [ :end_document ] if empty?
- return @stack.shift if @stack.size > 0
- #STDERR.puts @source.encoding
- @source.read if @source.buffer.size<2
- #STDERR.puts "BUFFER = #{@source.buffer.inspect}"
- if @document_status == nil
- #@source.consume( /^\s*/um )
- word = @source.match( /^((?:\s+)|(?:<[^>]*>))/um )
- word = word[1] unless word.nil?
- #STDERR.puts "WORD = #{word.inspect}"
- case word
- when COMMENT_START
- return [ :comment, @source.match( COMMENT_PATTERN, true )[1] ]
- when XMLDECL_START
- #STDERR.puts "XMLDECL"
- results = @source.match( XMLDECL_PATTERN, true )[1]
- version = VERSION.match( results )
- version = version[1] unless version.nil?
- encoding = ENCODING.match(results)
- encoding = encoding[1] unless encoding.nil?
- @source.encoding = encoding
- standalone = STANDALONE.match(results)
- standalone = standalone[1] unless standalone.nil?
- return [ :xmldecl, version, encoding, standalone ]
- when INSTRUCTION_START
- return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ]
- when DOCTYPE_START
- md = @source.match( DOCTYPE_PATTERN, true )
- @nsstack.unshift(curr_ns=Set.new)
- identity = md[1]
- close = md[2]
- identity =~ IDENTITY
- name = $1
- raise REXML::ParseException.new("DOCTYPE is missing a name") if name.nil?
- pub_sys = $2.nil? ? nil : $2.strip
- long_name = $4.nil? ? nil : $4.strip
- uri = $6.nil? ? nil : $6.strip
- args = [ :start_doctype, name, pub_sys, long_name, uri ]
- if close == ">"
- @document_status = :after_doctype
- @source.read if @source.buffer.size<2
- md = @source.match(/^\s*/um, true)
- @stack << [ :end_doctype ]
- else
- @document_status = :in_doctype
- end
- return args
- when /^\s+/
- else
- @document_status = :after_doctype
- @source.read if @source.buffer.size<2
- md = @source.match(/\s*/um, true)
- end
- end
- if @document_status == :in_doctype
- md = @source.match(/\s*(.*?>)/um)
- case md[1]
- when SYSTEMENTITY
- match = @source.match( SYSTEMENTITY, true )[1]
- return [ :externalentity, match ]
-
- when ELEMENTDECL_START
- return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
-
- when ENTITY_START
- match = @source.match( ENTITYDECL, true ).to_a.compact
- match[0] = :entitydecl
- ref = false
- if match[1] == '%'
- ref = true
- match.delete_at 1
- end
- # Now we have to sort out what kind of entity reference this is
- if match[2] == 'SYSTEM'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match.delete_at(4) if match.size > 4 # Chop out NDATA decl
- # match is [ :entity, name, SYSTEM, pubid(, ndata)? ]
- elsif match[2] == 'PUBLIC'
- # External reference
- match[3] = match[3][1..-2] # PUBID
- match[4] = match[4][1..-2] # HREF
- # match is [ :entity, name, PUBLIC, pubid, href ]
- else
- match[2] = match[2][1..-2]
- match.pop if match.size == 4
- # match is [ :entity, name, value ]
- end
- match << '%' if ref
- return match
- when ATTLISTDECL_START
- md = @source.match( ATTLISTDECL_PATTERN, true )
- raise REXML::ParseException.new( "Bad ATTLIST declaration!", @source ) if md.nil?
- element = md[1]
- contents = md[0]
-
- pairs = {}
- values = md[0].scan( ATTDEF_RE )
- values.each do |attdef|
- unless attdef[3] == "#IMPLIED"
- attdef.compact!
- val = attdef[3]
- val = attdef[4] if val == "#FIXED "
- pairs[attdef[0]] = val
- if attdef[0] =~ /^xmlns:(.*)/
- @nsstack[0] << $1
- end
- end
- end
- return [ :attlistdecl, element, pairs, contents ]
- when NOTATIONDECL_START
- md = nil
- if @source.match( PUBLIC )
- md = @source.match( PUBLIC, true )
- vals = [md[1],md[2],md[4],md[6]]
- elsif @source.match( SYSTEM )
- md = @source.match( SYSTEM, true )
- vals = [md[1],md[2],nil,md[4]]
- else
- raise REXML::ParseException.new( "error parsing notation: no matching pattern", @source )
- end
- return [ :notationdecl, *vals ]
- when CDATA_END
- @document_status = :after_doctype
- @source.match( CDATA_END, true )
- return [ :end_doctype ]
- end
- end
- begin
- if @source.buffer[0] == ?<
- if @source.buffer[1] == ?/
- @nsstack.shift
- last_tag = @tags.pop
- #md = @source.match_to_consume( '>', CLOSE_MATCH)
- md = @source.match( CLOSE_MATCH, true )
- raise REXML::ParseException.new( "Missing end tag for "+
- "'#{last_tag}' (got \"#{md[1]}\")",
- @source) unless last_tag == md[1]
- return [ :end_element, last_tag ]
- elsif @source.buffer[1] == ?!
- md = @source.match(/\A(\s*[^>]*>)/um)
- #STDERR.puts "SOURCE BUFFER = #{source.buffer}, #{source.buffer.size}"
- raise REXML::ParseException.new("Malformed node", @source) unless md
- if md[0][2] == ?-
- md = @source.match( COMMENT_PATTERN, true )
- return [ :comment, md[1] ] if md
- else
- md = @source.match( CDATA_PATTERN, true )
- return [ :cdata, md[1] ] if md
- end
- raise REXML::ParseException.new( "Declarations can only occur "+
- "in the doctype declaration.", @source)
- elsif @source.buffer[1] == ??
- md = @source.match( INSTRUCTION_PATTERN, true )
- return [ :processing_instruction, md[1], md[2] ] if md
- raise REXML::ParseException.new( "Bad instruction declaration",
- @source)
- else
- # Get the next tag
- md = @source.match(TAG_MATCH, true)
- unless md
- # Check for missing attribute quotes
- raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES )
- raise REXML::ParseException.new("malformed XML: missing tag start", @source)
- end
- attributes = {}
- prefixes = Set.new
- prefixes << md[2] if md[2]
- @nsstack.unshift(curr_ns=Set.new)
- if md[4].size > 0
- attrs = md[4].scan( ATTRIBUTE_PATTERN )
- raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0
- attrs.each { |a,b,c,d,e|
- if b == "xmlns"
- if c == "xml"
- if d != "http://www.w3.org/XML/1998/namespace"
- msg = "The 'xml' prefix must not be bound to any other namespace "+
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
- raise REXML::ParseException.new( msg, @source, self )
- end
- elsif c == "xmlns"
- msg = "The 'xmlns' prefix must not be declared "+
- "(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
- raise REXML::ParseException.new( msg, @source, self)
- end
- curr_ns << c
- elsif b
- prefixes << b unless b == "xml"
- end
- attributes[a] = e
- }
- end
-
- # Verify that all of the prefixes have been defined
- for prefix in prefixes
- unless @nsstack.find{|k| k.member?(prefix)}
- raise UndefinedNamespaceException.new(prefix,@source,self)
- end
- end
-
- if md[6]
- @closed = md[1]
- @nsstack.shift
- else
- @tags.push( md[1] )
- end
- return [ :start_element, md[1], attributes ]
- end
- else
- md = @source.match( TEXT_PATTERN, true )
- if md[0].length == 0
- @source.match( /(\s+)/, true )
- end
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
- #return [ :text, "" ] if md[0].length == 0
- # unnormalized = Text::unnormalize( md[1], self )
- # return PullEvent.new( :text, md[1], unnormalized )
- return [ :text, md[1] ]
- end
- rescue REXML::UndefinedNamespaceException
- raise
- rescue REXML::ParseException
- raise
- rescue Exception, NameError => error
- raise REXML::ParseException.new( "Exception parsing",
- @source, self, (error ? error : $!) )
- end
- return [ :dummy ]
- end
-
- def entity( reference, entities )
- value = nil
- value = entities[ reference ] if entities
- if not value
- value = DEFAULT_ENTITIES[ reference ]
- value = value[2] if value
- end
- unnormalize( value, entities ) if value
- end
-
- # Escapes all possible entities
- def normalize( input, entities=nil, entity_filter=nil )
- copy = input.clone
- # Doing it like this rather than in a loop improves the speed
- copy.gsub!( EREFERENCE, '&amp;' )
- entities.each do |key, value|
- copy.gsub!( value, "&#{key};" ) unless entity_filter and
- entity_filter.include?(entity)
- end if entities
- copy.gsub!( EREFERENCE, '&amp;' )
- DEFAULT_ENTITIES.each do |key, value|
- copy.gsub!( value[3], value[1] )
- end
- copy
- end
-
- # Unescapes all possible entities
- def unnormalize( string, entities=nil, filter=nil )
- rv = string.clone
- rv.gsub!( /\r\n?/, "\n" )
- matches = rv.scan( REFERENCE_RE )
- return rv if matches.size == 0
- rv.gsub!( /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/ ) {|m|
- m=$1
- m = "0#{m}" if m[0] == ?x
- [Integer(m)].pack('U*')
- }
- matches.collect!{|x|x[0]}.compact!
- if matches.size > 0
- matches.each do |entity_reference|
- unless filter and filter.include?(entity_reference)
- entity_value = entity( entity_reference, entities )
- if entity_value
- re = /&#{entity_reference};/
- rv.gsub!( re, entity_value )
- end
- end
- end
- matches.each do |entity_reference|
- unless filter and filter.include?(entity_reference)
- er = DEFAULT_ENTITIES[entity_reference]
- rv.gsub!( er[0], er[2] ) if er
- end
- end
- rv.gsub!( /&amp;/, '&' )
- end
- rv
- end
- end
- end
-end
-
-=begin
- case event[0]
- when :start_element
- when :text
- when :end_element
- when :processing_instruction
- when :cdata
- when :comment
- when :xmldecl
- when :start_doctype
- when :end_doctype
- when :externalentity
- when :elementdecl
- when :entity
- when :attlistdecl
- when :notationdecl
- when :end_doctype
- end
-=end