summaryrefslogtreecommitdiff
path: root/lib/rexml/parsers/pullparser.rb
blob: 0a328ea8fcaa4092cb4e23b29217bcc48c9efe77 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
require 'rexml/parseexception'
require 'rexml/parsers/baseparser'
require 'rexml/xmltokens'

module REXML
	module Parsers
		# = Using the Pull Parser
		# <em>This API is experimental, and subject to change.</em>
		#  parser = PullParser.new( "<a>text<b att='val'/>txet</a>" )
		#  while parser.has_next?
		#    res = parser.pull
		#    puts res[1]['att'] if res.start_element? and res[0] == 'b'
		#  end
		# See the PullEvent class for information on the content of the results.
		# The data is identical to the arguments passed for the various events to
		# the StreamListener API.
		#
		# Notice that:
		#  parser = PullParser.new( "<a>BAD DOCUMENT" )
		#  while parser.has_next?
		#    res = parser.pull
		#    raise res[1] if res.error?
		#  end
		#
		# Nat Price gave me some good ideas for the API.
		class PullParser
			include XMLTokens

			# Creates a pull parser over +stream+, which is passed unchanged to
			# BaseParser.new.
			def initialize( stream )
				@parser = BaseParser.new( stream )
				@listeners = nil
				@entities = {}
			end

			# Appends +listener+ to this parser's listener list, creating the list
			# on first use.  Returns the list.
			def add_listener( listener )
				@listeners ||= []
				@listeners << listener
			end

			# Yields each remaining event (as returned by #pull) until #has_next?
			# is false.
			def each
				yield pull while has_next?
			end

			# Returns the underlying parser's peek( depth ) result wrapped in a
			# PullEvent.
			def peek( depth=0 )
				upcoming = @parser.peek( depth )
				PullEvent.new( upcoming )
			end

			# Delegates to the underlying parser's has_next?.
			def has_next?
				@parser.has_next?
			end

			# Pulls the next event from the underlying parser and returns it
			# wrapped in a PullEvent.
			#
			# * For :entitydecl events, event[2] is recorded under event[1] in this
			#   parser's entity table unless it matches /PUBLIC|SYSTEM/.
			# * For :text events, the text unnormalized against that entity table
			#   is appended to the event.
			def pull
				event = @parser.pull
				kind = event[0]
				if kind == :entitydecl
					name, value = event[1], event[2]
					@entities[ name ] = value unless value =~ /PUBLIC|SYSTEM/
				elsif kind == :text
					event << @parser.unnormalize( event[1], @entities )
				end
				PullEvent.new( event )
			end

			# Delegates to the underlying parser's unshift.
			def unshift( token )
				@parser.unshift( token )
			end

			# Delegates to the underlying parser's entity lookup.
			def entity( reference )
				@parser.entity( reference )
			end

			# Delegates to the underlying parser's empty?.
			def empty?
				@parser.empty?
			end

		end

		# A parsing event.  The contents of the event are accessed as an +Array+,
		# and the type is given either by the ...? methods, or by calling the
		# +event_type+ method.  The contents of this object vary from event to
		# event, but are identical to the arguments passed to +StreamListener+s for
		# each event.
		class PullEvent
			# Wraps a raw event.
			#
			# arg:: the event Array; element 0 is the event type and the remaining
			#       elements are the event's content.
			def initialize(arg)
				@contents = arg
			end

			# Indexes into the event's content, i.e. the elements that follow the
			# event type.  Index 0 is the first content item in every form:
			#
			#  event[0]      # first content item
			#  event[0, 2]   # first two content items
			#  event[0..1]   # first two content items
			#  event[-1]     # last content item
			#
			# Raises a RuntimeError if +start+ is neither a Range nor a Numeric.
			def []( start, endd=nil )
				# Index a view without the leading event type, so that range ends
				# and negative indices are relative to the content as well, not
				# just the start position.
				content = @contents.drop( 1 )
				if start.kind_of? Range
					content.slice( start )
				elsif start.kind_of? Numeric
					if endd.nil?
						content.slice( start )
					else
						content.slice( start, endd )
					end
				else
					raise "Illegal argument #{start.inspect} (#{start.class})"
				end
			end

			# The type of this event (element 0 of the raw event).  The predicates
			# below test for :start_element, :end_element, :text,
			# :processing_instruction, :comment, :start_doctype, :attlistdecl,
			# :elementdecl, :entitydecl, :notationdecl, :entity, :cdata, :xmldecl
			# and :error.
			def event_type
				@contents[0]
			end

			# Content: [ String tag_name, Hash attributes ]
			def start_element?
				@contents[0] == :start_element
			end

			# Content: [ String tag_name ]
			def end_element?
				@contents[0] == :end_element
			end

			# Content: [ String raw_text, String unnormalized_text ]
			def text?
				@contents[0] == :text
			end

			# Content: [ String text ]
			def instruction?
				@contents[0] == :processing_instruction
			end

			# Content: [ String text ]
			def comment?
				@contents[0] == :comment
			end

			# Content: [ String name, String pub_sys, String long_name, String uri ]
			def doctype?
				@contents[0] == :start_doctype
			end

			# Content: [ String text ]
			def attlistdecl?
				@contents[0] == :attlistdecl
			end

			# Content: [ String text ]
			def elementdecl?
				@contents[0] == :elementdecl
			end

			# Due to the wonders of DTDs, an entity declaration can be just about
			# anything.  There's no way to normalize it; you'll have to interpret the
			# content yourself.  However, the following is true:
			#
			# * If the entity declaration is an internal entity:
			#   [ String name, String value ]
			# Content: [ String text ]
			def entitydecl?
				@contents[0] == :entitydecl
			end

			# Content: [ String text ]
			def notationdecl?
				@contents[0] == :notationdecl
			end

			# Content: [ String text ]
			def entity?
				@contents[0] == :entity
			end

			# Content: [ String text ]
			def cdata?
				@contents[0] == :cdata
			end

			# Content: [ String version, String encoding, String standalone ]
			def xmldecl?
				@contents[0] == :xmldecl
			end

			# True when this event's type is :error.
			def error?
				@contents[0] == :error
			end

			# Returns "<type>: <content array inspected>".
			def inspect
				@contents[0].to_s + ": " + @contents[1..-1].inspect
			end
		end
	end
end