require 'rexml/parseexception'
require 'rexml/parsers/baseparser'
require 'rexml/xmltokens'

module REXML
  module Parsers
    # = Using the Pull Parser
    # <em>This API is experimental, and subject to change.</em>
    #  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
    #  while parser.has_next?
    #    res = parser.pull
    #    puts res[1]['att'] if res.start_element? and res[0] == 'b'
    #  end
    # See the PullEvent class for information on the content of the results.
    # The data is identical to the arguments passed for the various events to
    # the StreamListener API.
    #
    # Notice that:
    #  parser = PullParser.new( "<a>BAD DOCUMENT" )
    #  while parser.has_next?
    #    res = parser.pull
    #    raise res[1] if res.error?
    #  end
    #
    # Nat Price gave me some good ideas for the API.
    class PullParser < BaseParser
      include XMLTokens

      # Creates a pull parser over +stream+, which is handed unchanged to
      # BaseParser#initialize.
      def initialize(stream)
        super
        # Internal entity declarations seen so far (name => value); used by
        # #pull to unnormalize text events.
        @entities = {}
        @listeners = nil
      end

      # Registers +listener+ and returns the listener list.
      # NOTE(review): nothing in this class notifies the listeners; presumably
      # that is BaseParser's job -- confirm.
      def add_listener(listener)
        @listeners ||= []
        @listeners << listener
      end

      # Yields each remaining event, as a PullEvent, until +has_next?+ is
      # false. Returns an Enumerator when called without a block.
      def each
        return enum_for(:each) unless block_given?

        yield pull while has_next?
      end

      # Returns the result of BaseParser#peek for +depth+, wrapped in a
      # PullEvent.
      def peek(depth = 0)
        # Bare +super+ forwards +depth+ to BaseParser#peek.
        PullEvent.new(super)
      end

      # Fetches the next raw event from BaseParser#pull and returns it wrapped
      # in a PullEvent.
      #
      # * :entitydecl events whose third element does not match
      #   /PUBLIC|SYSTEM/ are remembered as name => value.
      # * :text events get the unnormalized text (expanded using the
      #   remembered entities) appended as an extra trailing element.
      def pull
        event = super
        case event[0]
        when :entitydecl
          @entities[event[1]] = event[2] unless event[2] =~ /PUBLIC|SYSTEM/
        when :text
          event << unnormalize(event[1], @entities)
        end
        PullEvent.new(event)
      end
    end

    # A parsing event.  The contents of the event are accessed as an +Array+,
    # and the type is given either by the ...? methods, or by calling
    # +event_type+.  The contents of this object vary from event to event,
    # but are identical to the arguments passed to +StreamListener+s for each
    # event.
    class PullEvent
      # Wraps +arg+, an Array whose first element is the event type and whose
      # remaining elements are the event's contents.
      def initialize(arg)
        @contents = arg
      end

      # Returns the content element at +index+, where index 0 is the first
      # element after the event type.  Note that an index of -1 therefore
      # returns the event type itself.
      def [](index)
        @contents[index + 1]
      end

      # The type of this event.  The predicates below test for
      # :start_element, :end_element, :text, :processing_instruction,
      # :comment, :start_doctype, :attlistdecl, :elementdecl, :entitydecl,
      # :notationdecl, :entity, :cdata, :xmldecl, and :error.
      def event_type
        @contents[0]
      end

      # Content: [ String tag_name, Hash attributes ]
      def start_element?
        event_type == :start_element
      end

      # Content: [ String tag_name ]
      def end_element?
        event_type == :end_element
      end

      # Content: [ String raw_text, String unnormalized_text ]
      def text?
        event_type == :text
      end

      # Content: [ String text ]
      def instruction?
        event_type == :processing_instruction
      end

      # Content: [ String text ]
      def comment?
        event_type == :comment
      end

      # Content: [ String name, String pub_sys, String long_name, String uri ]
      def doctype?
        event_type == :start_doctype
      end

      # Content: [ String text ]
      def attlistdecl?
        event_type == :attlistdecl
      end

      # Content: [ String text ]
      def elementdecl?
        event_type == :elementdecl
      end

      # Due to the wonders of DTDs, an entity declaration can be just about
      # anything.  There's no way to normalize it; you'll have to interpret the
      # content yourself.  However, the following is true:
      #
      # * If the entity declaration is an internal entity:
      #   [ String name, String value ]
      def entitydecl?
        event_type == :entitydecl
      end

      # Content: [ String text ]
      def notationdecl?
        event_type == :notationdecl
      end

      # Content: [ String text ]
      def entity?
        event_type == :entity
      end

      # Content: [ String text ]
      def cdata?
        event_type == :cdata
      end

      # Content: [ String version, String encoding, String standalone ]
      def xmldecl?
        event_type == :xmldecl
      end

      # True when this event reports a parse error.
      def error?
        event_type == :error
      end

      # Renders the event as "type: [contents...]".
      def inspect
        "#{@contents[0]}: #{@contents[1..-1].inspect}"
      end
    end
  end
end