diff options
Diffstat (limited to 'lib/rexml/parsers/baseparser.rb')
-rw-r--r-- | lib/rexml/parsers/baseparser.rb | 169 |
1 files changed, 111 insertions, 58 deletions
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 4df1f57a05..de337a74d5 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -3,6 +3,7 @@ require_relative '../parseexception' require_relative '../undefinednamespaceexception' require_relative '../source' require 'set' +require "strscan" module REXML module Parsers @@ -32,9 +33,9 @@ module REXML COMBININGCHAR = '' # TODO EXTENDER = '' # TODO - NCNAME_STR= "[#{LETTER}_:][-[:alnum:]._:#{COMBININGCHAR}#{EXTENDER}]*" - NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" - UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" + NCNAME_STR= "[#{LETTER}_][-[:alnum:]._#{COMBININGCHAR}#{EXTENDER}]*" + QNAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})" + QNAME = /(#{QNAME_STR})/ NAMECHAR = '[\-\w\.:]' NAME = "([\\w:]#{NAMECHAR}*)" @@ -46,7 +47,7 @@ module REXML DOCTYPE_START = /\A\s*<!DOCTYPE\s/um DOCTYPE_END = /\A\s*\]\s*>/um DOCTYPE_PATTERN = /\s*<!DOCTYPE\s+(.*?)(\[|>)/um - ATTRIBUTE_PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\4/um + ATTRIBUTE_PATTERN = /\s*(#{QNAME_STR})\s*=\s*(["'])(.*?)\4/um COMMENT_START = /\A<!--/u COMMENT_PATTERN = /<!--(.*?)-->/um CDATA_START = /\A<!\[CDATA\[/u @@ -55,9 +56,9 @@ module REXML XMLDECL_START = /\A<\?xml\s/u; XMLDECL_PATTERN = /<\?xml\s+(.*?)\?>/um INSTRUCTION_START = /\A<\?/u - INSTRUCTION_PATTERN = /<\?(.*?)(\s+.*?)?\?>/um - TAG_MATCH = /^<((?>#{NAME_STR}))\s*((?>\s+#{UNAME_STR}\s*=\s*(["']).*?\5)*)\s*(\/)?>/um - CLOSE_MATCH = /^\s*<\/(#{NAME_STR})\s*>/um + INSTRUCTION_PATTERN = /<\?#{NAME}(\s+.*?)?\?>/um + TAG_MATCH = /^<((?>#{QNAME_STR}))/um + CLOSE_MATCH = /^\s*<\/(#{QNAME_STR})\s*>/um VERSION = /\bversion\s*=\s*["'](.*?)['"]/um ENCODING = /\bencoding\s*=\s*["'](.*?)['"]/um @@ -107,13 +108,6 @@ module REXML "apos" => [/'/, "'", "'", /'/] } - - ###################################################################### - # These are patterns to identify common markup errors, to make the - # error messages more informative. - ###################################################################### - MISSING_ATTRIBUTE_QUOTES = /^<#{NAME_STR}\s+#{NAME_STR}\s*=\s*[^"']/um - def initialize( source ) self.stream = source @listeners = [] @@ -224,7 +218,7 @@ module REXML standalone = standalone[1] unless standalone.nil? return [ :xmldecl, version, encoding, standalone ] when INSTRUCTION_START - return [ :processing_instruction, *@source.match(INSTRUCTION_PATTERN, true)[1,2] ] + return process_instruction when DOCTYPE_START md = @source.match( DOCTYPE_PATTERN, true ) @nsstack.unshift(curr_ns=Set.new) @@ -336,11 +330,12 @@ module REXML if @source.buffer[1] == ?/ @nsstack.shift last_tag = @tags.pop - #md = @source.match_to_consume( '>', CLOSE_MATCH) md = @source.match( CLOSE_MATCH, true ) - raise REXML::ParseException.new( "Missing end tag for "+ - "'#{last_tag}' (got \"#{md[1]}\")", - @source) unless last_tag == md[1] + if md.nil? or last_tag != md[1] + message = "Missing end tag for '#{last_tag}'" + message << " (got '#{md[1]}')" if md + raise REXML::ParseException.new(message, @source) + end return [ :end_element, last_tag ] elsif @source.buffer[1] == ?! md = @source.match(/\A(\s*[^>]*>)/um) @@ -362,52 +357,17 @@ module REXML raise REXML::ParseException.new( "Declarations can only occur "+ "in the doctype declaration.", @source) elsif @source.buffer[1] == ?? - md = @source.match( INSTRUCTION_PATTERN, true ) - return [ :processing_instruction, md[1], md[2] ] if md - raise REXML::ParseException.new( "Bad instruction declaration", - @source) + return process_instruction else # Get the next tag md = @source.match(TAG_MATCH, true) unless md - # Check for missing attribute quotes - raise REXML::ParseException.new("missing attribute quote", @source) if @source.match(MISSING_ATTRIBUTE_QUOTES ) raise REXML::ParseException.new("malformed XML: missing tag start", @source) end - attributes = {} prefixes = Set.new prefixes << md[2] if md[2] @nsstack.unshift(curr_ns=Set.new) - if md[4].size > 0 - attrs = md[4].scan( ATTRIBUTE_PATTERN ) - raise REXML::ParseException.new( "error parsing attributes: [#{attrs.join ', '}], excess = \"#$'\"", @source) if $' and $'.strip.size > 0 - attrs.each do |attr_name, prefix, local_part, quote, value| - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) - end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) - end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - - if attributes.has_key?(attr_name) - msg = "Duplicate attribute #{attr_name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end - - attributes[attr_name] = value - end - end - + attributes, closed = parse_attributes(prefixes, curr_ns) # Verify that all of the prefixes have been defined for prefix in prefixes unless @nsstack.find{|k| k.member?(prefix)} @@ -415,7 +375,7 @@ module REXML end end - if md[6] + if closed @closed = md[1] @nsstack.shift else @@ -438,7 +398,7 @@ module REXML raise rescue REXML::ParseException raise - rescue Exception, NameError => error + rescue => error raise REXML::ParseException.new( "Exception parsing", @source, self, (error ? error : $!) ) end @@ -508,6 +468,99 @@ module REXML return false if /\AUTF-16\z/i =~ xml_declaration_encoding true end + + def process_instruction + match_data = @source.match(INSTRUCTION_PATTERN, true) + unless match_data + message = "Invalid processing instruction node" + raise REXML::ParseException.new(message, @source) + end + [:processing_instruction, match_data[1], match_data[2]] + end + + def parse_attributes(prefixes, curr_ns) + attributes = {} + closed = false + match_data = @source.match(/^(.*?)(\/)?>/um, true) + if match_data.nil? + message = "Start tag isn't ended" + raise REXML::ParseException.new(message, @source) + end + + raw_attributes = match_data[1] + closed = !match_data[2].nil? + return attributes, closed if raw_attributes.nil? + return attributes, closed if raw_attributes.empty? + + scanner = StringScanner.new(raw_attributes) + until scanner.eos? + if scanner.scan(/\s+/) + break if scanner.eos? + end + + pos = scanner.pos + loop do + break if scanner.scan(ATTRIBUTE_PATTERN) + unless scanner.scan(QNAME) + message = "Invalid attribute name: <#{scanner.rest}>" + raise REXML::ParseException.new(message, @source) + end + name = scanner[0] + unless scanner.scan(/\s*=\s*/um) + message = "Missing attribute equal: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + quote = scanner.scan(/['"]/) + unless quote + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) + end + unless scanner.scan(/.*#{Regexp.escape(quote)}/um) + match_data = @source.match(/^(.*?)(\/)?>/um, true) + if match_data + scanner << "/" if closed + scanner << ">" + scanner << match_data[1] + scanner.pos = pos + closed = !match_data[2].nil? + next + end + message = + "Missing attribute value end quote: <#{name}>: <#{quote}>" + raise REXML::ParseException.new(message, @source) + end + end + name = scanner[1] + prefix = scanner[2] + local_part = scanner[3] + # quote = scanner[4] + value = scanner[5] + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self) + end + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" + end + + if attributes.has_key?(name) + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end + + attributes[name] = value + end + return attributes, closed + end end end end |