summaryrefslogtreecommitdiff
path: root/trunk/lib/rss/parser.rb
diff options
context:
space:
mode:
authoryugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-25 15:02:05 +0000
committeryugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-25 15:02:05 +0000
commit0dc342de848a642ecce8db697b8fecd83a63e117 (patch)
tree2b7ed4724aff1f86073e4740134bda9c4aac1a39 /trunk/lib/rss/parser.rb
parentef70cf7138ab8034b5b806f466e4b484b24f0f88 (diff)
added tag v1_9_0_4
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/tags/v1_9_0_4@18845 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'trunk/lib/rss/parser.rb')
-rw-r--r--trunk/lib/rss/parser.rb551
1 files changed, 551 insertions, 0 deletions
diff --git a/trunk/lib/rss/parser.rb b/trunk/lib/rss/parser.rb
new file mode 100644
index 0000000000..9b28f0fa8a
--- /dev/null
+++ b/trunk/lib/rss/parser.rb
@@ -0,0 +1,551 @@
+require "forwardable"
+require "open-uri"
+
+require "rss/rss"
+require "rss/xml"
+
+module RSS
+
+ class NotWellFormedError < Error
+ attr_reader :line, :element
+
+ # Create a new NotWellFormedError for an error at +line+
+ # in +element+. If a block is given the return value of
+ # the block ends up in the error message.
+ def initialize(line=nil, element=nil)
+ message = "This is not well formed XML"
+ if element or line
+ message << "\nerror occurred"
+ message << " in #{element}" if element
+ message << " at about #{line} line" if line
+ end
+ message << "\n#{yield}" if block_given?
+ super(message)
+ end
+ end
+
+ class XMLParserNotFound < Error
+ def initialize
+ super("available XML parser was not found in " <<
+ "#{AVAILABLE_PARSER_LIBRARIES.inspect}.")
+ end
+ end
+
+ class NotValidXMLParser < Error
+ def initialize(parser)
+ super("#{parser} is not an available XML parser. " <<
+ "Available XML parser" <<
+ (AVAILABLE_PARSERS.size > 1 ? "s are " : " is ") <<
+ "#{AVAILABLE_PARSERS.inspect}.")
+ end
+ end
+
+ class NSError < InvalidRSSError
+ attr_reader :tag, :prefix, :uri
+ def initialize(tag, prefix, require_uri)
+ @tag, @prefix, @uri = tag, prefix, require_uri
+ super("prefix <#{prefix}> doesn't associate uri " <<
+ "<#{require_uri}> in tag <#{tag}>")
+ end
+ end
+
+ class Parser
+
+ extend Forwardable
+
+ class << self
+
+ @@default_parser = nil
+
+ def default_parser
+ @@default_parser || AVAILABLE_PARSERS.first
+ end
+
+ # Set @@default_parser to new_value if it is one of the
+ # available parsers. Else raise NotValidXMLParser error.
+ def default_parser=(new_value)
+ if AVAILABLE_PARSERS.include?(new_value)
+ @@default_parser = new_value
+ else
+ raise NotValidXMLParser.new(new_value)
+ end
+ end
+
+ def parse(rss, do_validate=true, ignore_unknown_element=true,
+ parser_class=default_parser)
+ parser = new(rss, parser_class)
+ parser.do_validate = do_validate
+ parser.ignore_unknown_element = ignore_unknown_element
+ parser.parse
+ end
+ end
+
+ def_delegators(:@parser, :parse, :rss,
+ :ignore_unknown_element,
+ :ignore_unknown_element=, :do_validate,
+ :do_validate=)
+
+ def initialize(rss, parser_class=self.class.default_parser)
+ @parser = parser_class.new(normalize_rss(rss))
+ end
+
+ private
+
+ # Try to get the XML associated with +rss+.
+ # Return +rss+ if it already looks like XML, or treat it as a URI,
+ # or a file to get the XML,
+ def normalize_rss(rss)
+ return rss if maybe_xml?(rss)
+
+ uri = to_uri(rss)
+
+ if uri.respond_to?(:read)
+ uri.read
+ elsif !rss.tainted? and File.readable?(rss)
+ File.open(rss) {|f| f.read}
+ else
+ rss
+ end
+ end
+
+ # maybe_xml? tests if source is a string that looks like XML.
+ def maybe_xml?(source)
+ source.is_a?(String) and /</ =~ source
+ end
+
+ # Attempt to convert rss to a URI, but just return it if
+ # there's a ::URI::Error
+ def to_uri(rss)
+ return rss if rss.is_a?(::URI::Generic)
+
+ begin
+ ::URI.parse(rss)
+ rescue ::URI::Error
+ rss
+ end
+ end
+ end
+
+ class BaseParser
+
+ class << self
+ def raise_for_undefined_entity?
+ listener.raise_for_undefined_entity?
+ end
+ end
+
+ def initialize(rss)
+ @listener = self.class.listener.new
+ @rss = rss
+ end
+
+ def rss
+ @listener.rss
+ end
+
+ def ignore_unknown_element
+ @listener.ignore_unknown_element
+ end
+
+ def ignore_unknown_element=(new_value)
+ @listener.ignore_unknown_element = new_value
+ end
+
+ def do_validate
+ @listener.do_validate
+ end
+
+ def do_validate=(new_value)
+ @listener.do_validate = new_value
+ end
+
+ def parse
+ if @listener.rss.nil?
+ _parse
+ end
+ @listener.rss
+ end
+
+ end
+
+ class BaseListener
+
+ extend Utils
+
+ class << self
+
+ @@accessor_bases = {}
+ @@registered_uris = {}
+ @@class_names = {}
+
+ # return the setter for the uri, tag_name pair, or nil.
+ def setter(uri, tag_name)
+ _getter = getter(uri, tag_name)
+ if _getter
+ "#{_getter}="
+ else
+ nil
+ end
+ end
+
+ def getter(uri, tag_name)
+ (@@accessor_bases[uri] || {})[tag_name]
+ end
+
+ # return the tag_names for setters associated with uri
+ def available_tags(uri)
+ (@@accessor_bases[uri] || {}).keys
+ end
+
+ # register uri against this name.
+ def register_uri(uri, name)
+ @@registered_uris[name] ||= {}
+ @@registered_uris[name][uri] = nil
+ end
+
+ # test if this uri is registered against this name
+ def uri_registered?(uri, name)
+ @@registered_uris[name].has_key?(uri)
+ end
+
+ # record class_name for the supplied uri and tag_name
+ def install_class_name(uri, tag_name, class_name)
+ @@class_names[uri] ||= {}
+ @@class_names[uri][tag_name] = class_name
+ end
+
+ # retrieve class_name for the supplied uri and tag_name
+ # If it doesn't exist, capitalize the tag_name
+ def class_name(uri, tag_name)
+ name = (@@class_names[uri] || {})[tag_name]
+ return name if name
+
+ tag_name = tag_name.gsub(/[_\-]([a-z]?)/) {$1.upcase}
+ tag_name[0, 1].upcase + tag_name[1..-1]
+ end
+
+ def install_get_text_element(uri, name, accessor_base)
+ install_accessor_base(uri, name, accessor_base)
+ def_get_text_element(uri, name, *get_file_and_line_from_caller(1))
+ end
+
+ def raise_for_undefined_entity?
+ true
+ end
+
+ private
+ # set the accessor for the uri, tag_name pair
+ def install_accessor_base(uri, tag_name, accessor_base)
+ @@accessor_bases[uri] ||= {}
+ @@accessor_bases[uri][tag_name] = accessor_base.chomp("=")
+ end
+
+ def def_get_text_element(uri, element_name, file, line)
+ register_uri(uri, element_name)
+ method_name = "start_#{element_name}"
+ unless private_method_defined?(method_name)
+ define_method(method_name) do |name, prefix, attrs, ns|
+ uri = _ns(ns, prefix)
+ if self.class.uri_registered?(uri, element_name)
+ start_get_text_element(name, prefix, ns, uri)
+ else
+ start_else_element(name, prefix, attrs, ns)
+ end
+ end
+ private(method_name)
+ end
+ end
+ end
+ end
+
+ module ListenerMixin
+ attr_reader :rss
+
+ attr_accessor :ignore_unknown_element
+ attr_accessor :do_validate
+
+ def initialize
+ @rss = nil
+ @ignore_unknown_element = true
+ @do_validate = true
+ @ns_stack = [{"xml" => :xml}]
+ @tag_stack = [[]]
+ @text_stack = ['']
+ @proc_stack = []
+ @last_element = nil
+ @version = @encoding = @standalone = nil
+ @xml_stylesheets = []
+ @xml_child_mode = false
+ @xml_element = nil
+ @last_xml_element = nil
+ end
+
+ # set instance vars for version, encoding, standalone
+ def xmldecl(version, encoding, standalone)
+ @version, @encoding, @standalone = version, encoding, standalone
+ end
+
+ def instruction(name, content)
+ if name == "xml-stylesheet"
+ params = parse_pi_content(content)
+ if params.has_key?("href")
+ @xml_stylesheets << XMLStyleSheet.new(params)
+ end
+ end
+ end
+
+ def tag_start(name, attributes)
+ @text_stack.push('')
+
+ ns = @ns_stack.last.dup
+ attrs = {}
+ attributes.each do |n, v|
+ if /\Axmlns(?:\z|:)/ =~ n
+ ns[$POSTMATCH] = v
+ else
+ attrs[n] = v
+ end
+ end
+ @ns_stack.push(ns)
+
+ prefix, local = split_name(name)
+ @tag_stack.last.push([_ns(ns, prefix), local])
+ @tag_stack.push([])
+ if @xml_child_mode
+ previous = @last_xml_element
+ element_attrs = attributes.dup
+ unless previous
+ ns.each do |ns_prefix, value|
+ next if ns_prefix == "xml"
+ key = ns_prefix.empty? ? "xmlns" : "xmlns:#{ns_prefix}"
+ element_attrs[key] ||= value
+ end
+ end
+ next_element = XML::Element.new(local,
+ prefix.empty? ? nil : prefix,
+ _ns(ns, prefix),
+ element_attrs)
+ previous << next_element if previous
+ @last_xml_element = next_element
+ pr = Proc.new do |text, tags|
+ if previous
+ @last_xml_element = previous
+ else
+ @xml_element = @last_xml_element
+ @last_xml_element = nil
+ end
+ end
+ @proc_stack.push(pr)
+ else
+ if @rss.nil? and respond_to?("initial_start_#{local}", true)
+ __send__("initial_start_#{local}", local, prefix, attrs, ns.dup)
+ elsif respond_to?("start_#{local}", true)
+ __send__("start_#{local}", local, prefix, attrs, ns.dup)
+ else
+ start_else_element(local, prefix, attrs, ns.dup)
+ end
+ end
+ end
+
+ def tag_end(name)
+ if DEBUG
+ p "end tag #{name}"
+ p @tag_stack
+ end
+ text = @text_stack.pop
+ tags = @tag_stack.pop
+ pr = @proc_stack.pop
+ pr.call(text, tags) unless pr.nil?
+ @ns_stack.pop
+ end
+
+ def text(data)
+ if @xml_child_mode
+ @last_xml_element << data if @last_xml_element
+ else
+ @text_stack.last << data
+ end
+ end
+
+ private
+ def _ns(ns, prefix)
+ ns.fetch(prefix, "")
+ end
+
+ CONTENT_PATTERN = /\s*([^=]+)=(["'])([^\2]+?)\2/
+ # Extract the first name="value" pair from content.
+ # Works with single quotes according to the constant
+ # CONTENT_PATTERN. Return a Hash.
+ def parse_pi_content(content)
+ params = {}
+ content.scan(CONTENT_PATTERN) do |name, quote, value|
+ params[name] = value
+ end
+ params
+ end
+
+ def start_else_element(local, prefix, attrs, ns)
+ class_name = self.class.class_name(_ns(ns, prefix), local)
+ current_class = @last_element.class
+ if known_class?(current_class, class_name)
+ next_class = current_class.const_get(class_name)
+ start_have_something_element(local, prefix, attrs, ns, next_class)
+ else
+ if !@do_validate or @ignore_unknown_element
+ @proc_stack.push(nil)
+ else
+ parent = "ROOT ELEMENT???"
+ if current_class.tag_name
+ parent = current_class.tag_name
+ end
+ raise NotExpectedTagError.new(local, _ns(ns, prefix), parent)
+ end
+ end
+ end
+
+ if Module.method(:const_defined?).arity == -1
+ def known_class?(target_class, class_name)
+ class_name and
+ (target_class.const_defined?(class_name, false) or
+ target_class.constants.include?(class_name.to_sym))
+ end
+ else
+ def known_class?(target_class, class_name)
+ class_name and
+ (target_class.const_defined?(class_name) or
+ target_class.constants.include?(class_name))
+ end
+ end
+
+ NAMESPLIT = /^(?:([\w:][-\w\d.]*):)?([\w:][-\w\d.]*)/
+ def split_name(name)
+ name =~ NAMESPLIT
+ [$1 || '', $2]
+ end
+
+ def check_ns(tag_name, prefix, ns, require_uri)
+ unless _ns(ns, prefix) == require_uri
+ if @do_validate
+ raise NSError.new(tag_name, prefix, require_uri)
+ else
+ # Force bind required URI with prefix
+ @ns_stack.last[prefix] = require_uri
+ end
+ end
+ end
+
+ def start_get_text_element(tag_name, prefix, ns, required_uri)
+ pr = Proc.new do |text, tags|
+ setter = self.class.setter(required_uri, tag_name)
+ if @last_element.respond_to?(setter)
+ if @do_validate
+ getter = self.class.getter(required_uri, tag_name)
+ if @last_element.__send__(getter)
+ raise TooMuchTagError.new(tag_name, @last_element.tag_name)
+ end
+ end
+ @last_element.__send__(setter, text.to_s)
+ else
+ if @do_validate and !@ignore_unknown_element
+ raise NotExpectedTagError.new(tag_name, _ns(ns, prefix),
+ @last_element.tag_name)
+ end
+ end
+ end
+ @proc_stack.push(pr)
+ end
+
+ def start_have_something_element(tag_name, prefix, attrs, ns, klass)
+ check_ns(tag_name, prefix, ns, klass.required_uri)
+ attributes = collect_attributes(tag_name, prefix, attrs, ns, klass)
+ @proc_stack.push(setup_next_element(tag_name, klass, attributes))
+ end
+
+ def collect_attributes(tag_name, prefix, attrs, ns, klass)
+ attributes = {}
+ klass.get_attributes.each do |a_name, a_uri, required, element_name|
+ if a_uri.is_a?(String) or !a_uri.respond_to?(:include?)
+ a_uri = [a_uri]
+ end
+ unless a_uri == [""]
+ for prefix, uri in ns
+ if a_uri.include?(uri)
+ val = attrs["#{prefix}:#{a_name}"]
+ break if val
+ end
+ end
+ end
+ if val.nil? and a_uri.include?("")
+ val = attrs[a_name]
+ end
+
+ if @do_validate and required and val.nil?
+ unless a_uri.include?("")
+ for prefix, uri in ns
+ if a_uri.include?(uri)
+ a_name = "#{prefix}:#{a_name}"
+ end
+ end
+ end
+ raise MissingAttributeError.new(tag_name, a_name)
+ end
+
+ attributes[a_name] = val
+ end
+ attributes
+ end
+
+ def setup_next_element(tag_name, klass, attributes)
+ previous = @last_element
+ next_element = klass.new(@do_validate, attributes)
+ previous.set_next_element(tag_name, next_element)
+ @last_element = next_element
+ @last_element.parent = previous if klass.need_parent?
+ @xml_child_mode = @last_element.have_xml_content?
+
+ Proc.new do |text, tags|
+ p(@last_element.class) if DEBUG
+ if @xml_child_mode
+ @last_element.content = @xml_element.to_s
+ xml_setter = @last_element.class.xml_setter
+ @last_element.__send__(xml_setter, @xml_element)
+ @xml_element = nil
+ @xml_child_mode = false
+ else
+ if klass.have_content?
+ if @last_element.need_base64_encode?
+ text = text.lstrip.unpack("m").first
+ end
+ @last_element.content = text
+ end
+ end
+ if @do_validate
+ @last_element.validate_for_stream(tags, @ignore_unknown_element)
+ end
+ @last_element = previous
+ end
+ end
+ end
+
+ unless const_defined? :AVAILABLE_PARSER_LIBRARIES
+ AVAILABLE_PARSER_LIBRARIES = [
+ ["rss/xmlparser", :XMLParserParser],
+ ["rss/xmlscanner", :XMLScanParser],
+ ["rss/rexmlparser", :REXMLParser],
+ ]
+ end
+
+ AVAILABLE_PARSERS = []
+
+ AVAILABLE_PARSER_LIBRARIES.each do |lib, parser|
+ begin
+ require lib
+ AVAILABLE_PARSERS.push(const_get(parser))
+ rescue LoadError
+ end
+ end
+
+ if AVAILABLE_PARSERS.empty?
+ raise XMLParserNotFound
+ end
+end