diff options
Diffstat (limited to 'lib/rss/parser.rb')
| -rw-r--r-- | lib/rss/parser.rb | 227 |
1 files changed, 161 insertions, 66 deletions
diff --git a/lib/rss/parser.rb b/lib/rss/parser.rb index 033bc123aa..a9842e6d40 100644 --- a/lib/rss/parser.rb +++ b/lib/rss/parser.rb @@ -1,7 +1,9 @@ +# frozen_string_literal: false require "forwardable" require "open-uri" require "rss/rss" +require "rss/xml" module RSS @@ -33,8 +35,8 @@ module RSS class NotValidXMLParser < Error def initialize(parser) super("#{parser} is not an available XML parser. " << - "Available XML parser"<< - (AVAILABLE_PARSERS.size > 1 ? "s are ": " is ") << + "Available XML parser" << + (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") << "#{AVAILABLE_PARSERS.inspect}.") end end @@ -97,7 +99,7 @@ module RSS return rss if maybe_xml?(rss) uri = to_uri(rss) - + if uri.respond_to?(:read) uri.read elsif !rss.tainted? and File.readable?(rss) @@ -112,13 +114,13 @@ module RSS source.is_a?(String) and /</ =~ source end - # Attempt to convert rss to a URI, but just return it if + # Attempt to convert rss to a URI, but just return it if # there's a ::URI::Error def to_uri(rss) return rss if rss.is_a?(::URI::Generic) begin - URI(rss) + ::URI.parse(rss) rescue ::URI::Error rss end @@ -132,7 +134,7 @@ module RSS listener.raise_for_undefined_entity? end end - + def initialize(rss) @listener = self.class.listener.new @rss = rss @@ -173,35 +175,35 @@ module RSS class << self - @@setters = {} + @@accessor_bases = {} @@registered_uris = {} @@class_names = {} # return the setter for the uri, tag_name pair, or nil. def setter(uri, tag_name) - begin - @@setters[uri][tag_name] - rescue NameError + _getter = getter(uri, tag_name) + if _getter + "#{_getter}=" + else nil end end + def getter(uri, tag_name) + (@@accessor_bases[uri] || {})[tag_name] + end # return the tag_names for setters associated with uri def available_tags(uri) - begin - @@setters[uri].keys - rescue NameError - [] - end + (@@accessor_bases[uri] || {}).keys end - + # register uri against this name. def register_uri(uri, name) @@registered_uris[name] ||= {} @@registered_uris[name][uri] = nil end - + # test if this uri is registered against this name def uri_registered?(uri, name) @@registered_uris[name].has_key?(uri) @@ -216,52 +218,48 @@ module RSS # retrieve class_name for the supplied uri and tag_name # If it doesn't exist, capitalize the tag_name def class_name(uri, tag_name) - begin - @@class_names[uri][tag_name] - rescue NameError - tag_name[0,1].upcase + tag_name[1..-1] - end + name = (@@class_names[uri] || {})[tag_name] + return name if name + + tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase} + tag_name[0, 1].upcase + tag_name[1..-1] end - def install_get_text_element(uri, name, setter) - install_setter(uri, name, setter) + def install_get_text_element(uri, name, accessor_base) + install_accessor_base(uri, name, accessor_base) def_get_text_element(uri, name, *get_file_and_line_from_caller(1)) end - + def raise_for_undefined_entity? true end - + private - # set the setter for the uri, tag_name pair - def install_setter(uri, tag_name, setter) - @@setters[uri] ||= {} - @@setters[uri][tag_name] = setter + # set the accessor for the uri, tag_name pair + def install_accessor_base(uri, tag_name, accessor_base) + @@accessor_bases[uri] ||= {} + @@accessor_bases[uri][tag_name] = accessor_base.chomp("=") end - def def_get_text_element(uri, name, file, line) - register_uri(uri, name) - unless private_instance_methods(false).include?("start_#{name}") - module_eval(<<-EOT, file, line) - def start_#{name}(name, prefix, attrs, ns) + def def_get_text_element(uri, element_name, file, line) + register_uri(uri, element_name) + method_name = "start_#{element_name}" + unless private_method_defined?(method_name) + define_method(method_name) do |name, prefix, attrs, ns| uri = _ns(ns, prefix) - if self.class.uri_registered?(uri, #{name.inspect}) + if self.class.uri_registered?(uri, element_name) start_get_text_element(name, prefix, ns, uri) else start_else_element(name, prefix, attrs, ns) end end - EOT - __send__("private", "start_#{name}") + private(method_name) end end - end - end module ListenerMixin - attr_reader :rss attr_accessor :ignore_unknown_element @@ -271,15 +269,18 @@ module RSS @rss = nil @ignore_unknown_element = true @do_validate = true - @ns_stack = [{}] + @ns_stack = [{"xml" => :xml}] @tag_stack = [[]] @text_stack = [''] @proc_stack = [] @last_element = nil @version = @encoding = @standalone = nil @xml_stylesheets = [] + @xml_child_mode = false + @xml_element = nil + @last_xml_element = nil end - + # set instance vars for version, encoding, standalone def xmldecl(version, encoding, standalone) @version, @encoding, @standalone = version, encoding, standalone @@ -289,7 +290,7 @@ module RSS if name == "xml-stylesheet" params = parse_pi_content(content) if params.has_key?("href") - @xml_stylesheets << XMLStyleSheet.new(*params) + @xml_stylesheets << XMLStyleSheet.new(params) end end end @@ -311,10 +312,39 @@ module RSS prefix, local = split_name(name) @tag_stack.last.push([_ns(ns, prefix), local]) @tag_stack.push([]) - if respond_to?("start_#{local}", true) - __send__("start_#{local}", local, prefix, attrs, ns.dup) + if @xml_child_mode + previous = @last_xml_element + element_attrs = attributes.dup + unless previous + ns.each do |ns_prefix, value| + next if ns_prefix == "xml" + key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}" + element_attrs[key] ||= value + end + end + next_element = XML::Element.new(local, + prefix.empty? ? nil : prefix, + _ns(ns, prefix), + element_attrs) + previous << next_element if previous + @last_xml_element = next_element + pr = Proc.new do |text, tags| + if previous + @last_xml_element = previous + else + @xml_element = @last_xml_element + @last_xml_element = nil + end + end + @proc_stack.push(pr) else - start_else_element(local, prefix, attrs, ns.dup) + if @rss.nil? and respond_to?("initial_start_#{local}", true) + __send__("initial_start_#{local}", local, prefix, attrs, ns.dup) + elsif respond_to?("start_#{local}", true) + __send__("start_#{local}", local, prefix, attrs, ns.dup) + else + start_else_element(local, prefix, attrs, ns.dup) + end end end @@ -331,7 +361,11 @@ module RSS end def text(data) - @text_stack.last << data + if @xml_child_mode + @last_xml_element << data if @last_xml_element + else + @text_stack.last << data + end end private @@ -354,12 +388,12 @@ module RSS def start_else_element(local, prefix, attrs, ns) class_name = self.class.class_name(_ns(ns, prefix), local) current_class = @last_element.class - if current_class.constants.include?(class_name) + if known_class?(current_class, class_name) next_class = current_class.const_get(class_name) start_have_something_element(local, prefix, attrs, ns, next_class) else if !@do_validate or @ignore_unknown_element - @proc_stack.push(nil) + @proc_stack.push(setup_next_element_in_unknown_element) else parent = "ROOT ELEMENT???" if current_class.tag_name @@ -370,26 +404,56 @@ module RSS end end - NAMESPLIT = /^(?:([\w:][-\w\d.]*):)?([\w:][-\w\d.]*)/ + if Module.method(:const_defined?).arity == -1 + def known_class?(target_class, class_name) + class_name and + (target_class.const_defined?(class_name, false) or + target_class.constants.include?(class_name.to_sym)) + end + else + def known_class?(target_class, class_name) + class_name and + (target_class.const_defined?(class_name) or + target_class.constants.include?(class_name)) + end + end + + NAMESPLIT = /^(?:([\w:][-\w.]*):)?([\w:][-\w.]*)/ def split_name(name) name =~ NAMESPLIT [$1 || '', $2] end - def check_ns(tag_name, prefix, ns, require_uri) - if @do_validate - if _ns(ns, prefix) == require_uri - #ns.delete(prefix) - else + def check_ns(tag_name, prefix, ns, require_uri, ignore_unknown_element=nil) + if _ns(ns, prefix) == require_uri + true + else + if ignore_unknown_element.nil? + ignore_unknown_element = @ignore_unknown_element + end + + if ignore_unknown_element + false + elsif @do_validate raise NSError.new(tag_name, prefix, require_uri) + else + # Force bind required URI with prefix + @ns_stack.last[prefix] = require_uri + true end end end def start_get_text_element(tag_name, prefix, ns, required_uri) - @proc_stack.push Proc.new {|text, tags| + pr = Proc.new do |text, tags| setter = self.class.setter(required_uri, tag_name) - if @last_element.respond_to?(setter) + if setter and @last_element.respond_to?(setter) + if @do_validate + getter = self.class.getter(required_uri, tag_name) + if @last_element.__send__(getter) + raise TooMuchTagError.new(tag_name, @last_element.tag_name) + end + end @last_element.__send__(setter, text.to_s) else if @do_validate and !@ignore_unknown_element @@ -397,16 +461,22 @@ module RSS @last_element.tag_name) end end - } + end + @proc_stack.push(pr) end def start_have_something_element(tag_name, prefix, attrs, ns, klass) + if check_ns(tag_name, prefix, ns, klass.required_uri) + attributes = collect_attributes(tag_name, prefix, attrs, ns, klass) + @proc_stack.push(setup_next_element(tag_name, klass, attributes)) + else + @proc_stack.push(setup_next_element_in_unknown_element) + end + end - check_ns(tag_name, prefix, ns, klass.required_uri) - + def collect_attributes(tag_name, prefix, attrs, ns, klass) attributes = {} klass.get_attributes.each do |a_name, a_uri, required, element_name| - if a_uri.is_a?(String) or !a_uri.respond_to?(:include?) a_uri = [a_uri] end @@ -435,24 +505,48 @@ module RSS attributes[a_name] = val end + attributes + end + def setup_next_element(tag_name, klass, attributes) previous = @last_element next_element = klass.new(@do_validate, attributes) - previous.instance_eval {set_next_element(tag_name, next_element)} + previous.set_next_element(tag_name, next_element) @last_element = next_element - @proc_stack.push Proc.new { |text, tags| + @last_element.parent = previous if klass.need_parent? + @xml_child_mode = @last_element.have_xml_content? + + Proc.new do |text, tags| p(@last_element.class) if DEBUG - @last_element.content = text if klass.have_content? + if @xml_child_mode + @last_element.content = @xml_element.to_s + xml_setter = @last_element.class.xml_setter + @last_element.__send__(xml_setter, @xml_element) + @xml_element = nil + @xml_child_mode = false + else + if klass.have_content? + if @last_element.need_base64_encode? + text = text.lstrip.unpack("m").first + end + @last_element.content = text + end + end if @do_validate @last_element.validate_for_stream(tags, @ignore_unknown_element) end @last_element = previous - } + end end + def setup_next_element_in_unknown_element + current_element, @last_element = @last_element, nil + Proc.new {@last_element = current_element} + end end unless const_defined? :AVAILABLE_PARSER_LIBRARIES + # The list of all available libraries for parsing. AVAILABLE_PARSER_LIBRARIES = [ ["rss/xmlparser", :XMLParserParser], ["rss/xmlscanner", :XMLScanParser], @@ -460,6 +554,7 @@ module RSS ] end + # The list of all available parsers, in constant form. AVAILABLE_PARSERS = [] AVAILABLE_PARSER_LIBRARIES.each do |lib, parser| |
