summary | refs | log | tree | commit | diff
path: root/lib/rexml
diff options
context:
space:
mode:
author: ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-01-20 04:31:57 +0000
committer: ser <ser@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-01-20 04:31:57 +0000
commit: 66aeb2f7080dea92703f10546fb3cbcc946f6fa3 (patch)
tree: 4dfe958e610386eb752d60ad26a8f1702e1eb650 /lib/rexml
parent: 00190701e0b8cc9ce5dbe9c836e9584d1c1caeb8 (diff)
r1479@bean: ser | 2008-01-19 14:26:31 -0500
r1483@bean: ser | 2008-01-19 14:47:23 -0500
Sam's fixes:
* Don't blow up on empty documents
* Add a test case for sorted attributes
* Making the output predictable simplifies unit tests, and doesn't cost much given that most xml element have few attributes
* Ruby 1.9 revision 14922 is more strict
* Complete Ticket #134
* Fix for ticket #121
* Fix for ticket #124
* Fix for ticket #128
* Fix ticket #133
* Ticket #131 (Support Ruby 1.9)
* Fix for ticket #127
* Fix for ticket #123
* Add missing data needed by test case
r1481@bean (orig r1303): ser | 2008-01-19 17:22:32 -0500
Tagged for release
r1482@bean (orig r1304): ser | 2008-01-19 17:27:10 -0500
Version bump
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15141 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'lib/rexml')
-rw-r--r-- lib/rexml/attribute.rb 23
-rw-r--r-- lib/rexml/cdata.rb 2
-rw-r--r-- lib/rexml/element.rb 23
-rw-r--r-- lib/rexml/formatters/default.rb 2
-rw-r--r-- lib/rexml/parsers/baseparser.rb 38
-rw-r--r-- lib/rexml/rexml.rb 10
-rw-r--r-- lib/rexml/source.rb 6
-rw-r--r-- lib/rexml/text.rb 110
8 files changed, 163 insertions, 51 deletions
diff --git a/lib/rexml/attribute.rb b/lib/rexml/attribute.rb
index 89c1ada36c..17ced44c45 100644
--- a/lib/rexml/attribute.rb
+++ b/lib/rexml/attribute.rb
@@ -17,6 +17,8 @@ module REXML
attr_writer :normalized
PATTERN = /\s*(#{NAME_STR})\s*=\s*(["'])(.*?)\2/um
+ NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
+
# Constructor.
# FIXME: The parser doesn't catch illegal characters in attributes
#
@@ -110,15 +112,16 @@ module REXML
end
end
- # Returns the attribute value, with entities replaced
- def to_s
- return @normalized if @normalized
-
- doctype = nil
+ def doctype
if @element
doc = @element.document
doctype = doc.doctype if doc
end
+ end
+
+ # Returns the attribute value, with entities replaced
+ def to_s
+ return @normalized if @normalized
@normalized = Text::normalize( @unnormalized, doctype )
@unnormalized = nil
@@ -129,11 +132,6 @@ module REXML
# have been expanded to their values
def value
return @unnormalized if @unnormalized
- doctype = nil
- if @element
- doc = @element.document
- doctype = doc.doctype if doc
- end
@unnormalized = Text::unnormalize( @normalized, doctype )
@normalized = nil
@unnormalized
@@ -150,6 +148,11 @@ module REXML
# Returns this attribute
def element=( element )
@element = element
+
+ if @normalized
+ Text.check( @normalized, NEEDS_A_SECOND_CHECK, doctype )
+ end
+
self
end
diff --git a/lib/rexml/cdata.rb b/lib/rexml/cdata.rb
index efcb71160a..856b9ef8b2 100644
--- a/lib/rexml/cdata.rb
+++ b/lib/rexml/cdata.rb
@@ -13,7 +13,7 @@ module REXML
# CData.new( "Here is some CDATA" )
# CData.new( "Some unprocessed data", respect_whitespace_TF, parent_element )
def initialize( first, whitespace=true, parent=nil )
- super( first, whitespace, parent, true, true, ILLEGAL )
+ super( first, whitespace, parent, false, true, ILLEGAL )
end
# Make a copy of this object
diff --git a/lib/rexml/element.rb b/lib/rexml/element.rb
index 55094111e6..ecd10de965 100644
--- a/lib/rexml/element.rb
+++ b/lib/rexml/element.rb
@@ -558,7 +558,19 @@ module REXML
prefix = namespaces.index(namespace) if namespace
end
prefix = nil if prefix == 'xmlns'
- attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" )
+
+ ret_val =
+ attributes.get_attribute( "#{prefix ? prefix + ':' : ''}#{name}" )
+
+ return ret_val unless ret_val.nil?
+ return nil if prefix.nil?
+
+ # now check that prefix'es namespace is not the same as the
+ # default namespace
+ return nil unless ( namespaces[ prefix ] == namespaces[ 'xmlns' ] )
+
+ attributes.get_attribute( name )
+
end
# Evaluates to +true+ if this element has any attributes set, false
@@ -675,7 +687,7 @@ module REXML
# out = ''
# doc.write( out ) #-> doc is written to the string 'out'
# doc.write( $stdout ) #-> doc written to the console
- def write(writer=$stdout, indent=-1, transitive=false, ie_hack=false)
+ def write(output=$stdout, indent=-1, transitive=false, ie_hack=false)
Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters")
formatter = if indent > -1
if transitive
@@ -1217,14 +1229,17 @@ module REXML
#
# Method contributed by Henrik Martensson
def get_attribute_ns(namespace, name)
+ result = nil
each_attribute() { |attribute|
if name == attribute.name &&
namespace == attribute.namespace() &&
( !namespace.empty? || !attribute.fully_expanded_name.index(':') )
- return attribute
+ # foo will match xmlns:foo, but only if foo isn't also an attribute
+ result = attribute if !result or !namespace.empty? or
+ !attribute.fully_expanded_name.index(':')
end
}
- nil
+ result
end
end
end
diff --git a/lib/rexml/formatters/default.rb b/lib/rexml/formatters/default.rb
index 77381bdf84..b4d63bc5b5 100644
--- a/lib/rexml/formatters/default.rb
+++ b/lib/rexml/formatters/default.rb
@@ -63,7 +63,7 @@ module REXML
def write_element( node, output )
output << "<#{node.expanded_name}"
- node.attributes.each_attribute do |attr|
+ node.attributes.to_a.sort_by {|attr| attr.name}.each do |attr|
output << " "
attr.write( output )
end unless node.attributes.empty?
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 854e707fae..85f2c4e46d 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -25,7 +25,20 @@ module REXML
#
# Nat Price gave me some good ideas for the API.
class BaseParser
- NCNAME_STR= '[\w:][\-\w\d.]*'
+ if String.method_defined? :encode
+ # Oniguruma / POSIX [understands unicode]
+ LETTER = '[[:alpha:]]'
+ DIGIT = '[[:digit:]]'
+ else
+ # Ruby < 1.9 [doesn't understand unicode]
+ LETTER = 'a-zA-Z'
+ DIGIT = '\d'
+ end
+
+ COMBININGCHAR = '' # TODO
+ EXTENDER = '' # TODO
+
+ NCNAME_STR= "[#{LETTER}_:][-#{LETTER}#{DIGIT}._:#{COMBININGCHAR}#{EXTENDER}]*"
NAME_STR= "(?:(#{NCNAME_STR}):)?(#{NCNAME_STR})"
UNAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
@@ -33,7 +46,7 @@ module REXML
NAME = "([\\w:]#{NAMECHAR}*)"
NMTOKEN = "(?:#{NAMECHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
- REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
+ REFERENCE = "&(?:#{NAME};|#\\d+;|#x[0-9a-fA-F]+;)"
REFERENCE_RE = /#{REFERENCE}/
DOCTYPE_START = /\A\s*<!DOCTYPE\s/um
@@ -340,6 +353,12 @@ module REXML
raise REXML::ParseException.new("Malformed node", @source) unless md
if md[0][2] == ?-
md = @source.match( COMMENT_PATTERN, true )
+
+ case md[1]
+ when /--/, /-$/
+ raise REXML::ParseException.new("Malformed comment", @source)
+ end
+
return [ :comment, md[1] ] if md
else
md = @source.match( CDATA_PATTERN, true )
@@ -384,6 +403,12 @@ module REXML
elsif b
prefixes << b unless b == "xml"
end
+
+ if attributes.has_key? a
+ msg = "Duplicate attribute #{a.inspect}"
+ raise REXML::ParseException.new( msg, @source, self)
+ end
+
attributes[a] = e
}
end
@@ -470,15 +495,12 @@ module REXML
if entity_value
re = /&#{entity_reference};/
rv.gsub!( re, entity_value )
+ else
+ er = DEFAULT_ENTITIES[entity_reference]
+ rv.gsub!( er[0], er[2] ) if er
end
end
end
- matches.each do |entity_reference|
- unless filter and filter.include?(entity_reference)
- er = DEFAULT_ENTITIES[entity_reference]
- rv.gsub!( er[0], er[2] ) if er
- end
- end
rv.gsub!( /&amp;/, '&' )
end
rv
diff --git a/lib/rexml/rexml.rb b/lib/rexml/rexml.rb
index 435242dc31..810af31356 100644
--- a/lib/rexml/rexml.rb
+++ b/lib/rexml/rexml.rb
@@ -11,8 +11,8 @@
#
# Main page:: http://www.germane-software.com/software/rexml
# Author:: Sean Russell <serATgermaneHYPHENsoftwareDOTcom>
-# Version:: 3.1.7.2
-# Date:: 2007/275
+# Date:: 2008/019
+# Version:: 3.1.7.3
#
# This API documentation can be downloaded from the REXML home page, or can
# be accessed online[http://www.germane-software.com/software/rexml_doc]
@@ -21,9 +21,9 @@
# or can be accessed
# online[http://www.germane-software.com/software/rexml/docs/tutorial.html]
module REXML
- COPYRIGHT = "Copyright © 2001-2007 Sean Russell <ser@germane-software.com>"
- DATE = "2007/275"
- VERSION = "3.1.7.2"
+ COPYRIGHT = "Copyright © 2001-2008 Sean Russell <ser@germane-software.com>"
+ DATE = "2008/019"
+ VERSION = "3.1.7.3"
REVISION = "$Revision$".gsub(/\$Revision:|\$/,'').strip
Copyright = COPYRIGHT
diff --git a/lib/rexml/source.rb b/lib/rexml/source.rb
index 3f14239a35..d4335138a1 100644
--- a/lib/rexml/source.rb
+++ b/lib/rexml/source.rb
@@ -147,7 +147,7 @@ module REXML
# the XML spec. If there is one, we can determine the encoding from
# it.
@buffer = ""
- str = @source.read( 2 )
+ str = @source.read( 2 ) || ''
if encoding
self.encoding = encoding
elsif str[0,2] == "\xfe\xff"
@@ -161,7 +161,7 @@ module REXML
else
@line_break = ">"
end
- super str+@source.readline( @line_break )
+ super( @source.eof? ? str : str+@source.readline( @line_break ) )
end
def scan(pattern, cons=false)
@@ -231,7 +231,7 @@ module REXML
end
def position
- @er_source.stat.pipe? ? 0 : @er_source.pos
+ @er_source.pos rescue 0
end
# @return the current line in the source
diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
index 8058157605..c23cd17c02 100644
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@@ -18,8 +18,40 @@ module REXML
# If +raw+ is true, then REXML leaves the value alone
attr_accessor :raw
- ILLEGAL = /(<|&(?!(#{Entity::NAME})|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));))/um
+ NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um
NUMERICENTITY = /&#0*((?:\d+)|(?:x[a-fA-F0-9]+));/
+ VALID_CHAR = [
+ 0x9, 0xA, 0xD,
+ (0x20..0xD7FF),
+ (0xE000..0xFFFD),
+ (0x10000..0x10FFFF)
+ ]
+
+ if String.method_defined? :encode
+ VALID_XML_CHARS = Regexp.new('^['+
+ VALID_CHAR.map { |item|
+ case item
+ when Fixnum
+ [item].pack('U').force_encoding('utf-8')
+ when Range
+ [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8')
+ end
+ }.join +
+ ']*$')
+ else
+ VALID_XML_CHARS = /^(
+ [\x09\x0A\x0D\x20-\x7E] # ASCII
+ | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte
+ | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs
+ | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte
+ | \xEF[\x80-\xBE]{2} #
+ | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff
+ | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates
+ | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3
+ | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15
+ | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16
+ )*$/x;
+ end
# Constructor
# +arg+ if a String, the content is set to the String. If a Text,
@@ -58,7 +90,7 @@ module REXML
#
# +pattern+ INTERNAL USE ONLY
def initialize(arg, respect_whitespace=false, parent=nil, raw=nil,
- entity_filter=nil, illegal=ILLEGAL )
+ entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK )
@raw = false
@@ -85,10 +117,54 @@ module REXML
@string.gsub!( /\r\n?/, "\n" )
- # check for illegal characters
- if @raw
- if @string =~ illegal
- raise "Illegal character '#{$1}' in raw string \"#{@string}\""
+ Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+ end
+
+ def parent= parent
+ super(parent)
+ Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent
+ end
+
+ # check for illegal characters
+ def Text.check string, pattern, doctype
+
+ # illegal anywhere
+ if string !~ VALID_XML_CHARS
+ if String.method_defined? :encode
+ string.chars.each do |c|
+ case c.ord
+ when *VALID_CHAR
+ else
+ raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+ end
+ end
+ else
+ string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/) do |c|
+ case c.unpack('U')
+ when *VALID_CHAR
+ else
+ raise "Illegal character #{c.inspect} in raw string \"#{string}\""
+ end
+ end
+ end
+ end
+
+ # context sensitive
+ string.scan(pattern).each do
+ if $1[-1] != ?;
+ raise "Illegal character '#{$1}' in raw string \"#{string}\""
+ elsif $1[0] == ?&
+ if $5 and $5[0] == ?#
+ case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i)
+ when *VALID_CHAR
+ else
+ raise "Illegal character '#{$1}' in raw string \"#{string}\""
+ end
+ elsif $3 and !SUBSTITUTES.include?($1)
+ if !doctype or !doctype.entities.has_key?($3)
+ raise "Undeclared entity '#{$1}' in raw string \"#{string}\""
+ end
+ end
end
end
end
@@ -120,6 +196,13 @@ module REXML
to_s() <=> other.to_s
end
+ def doctype
+ if @parent
+ doc = @parent.document
+ doc.doctype if doc
+ end
+ end
+
REFERENCE = /#{Entity::REFERENCE}/
# Returns the string value of this text node. This string is always
# escaped, meaning that it is a valid XML text node string, and all
@@ -138,12 +221,6 @@ module REXML
return @string if @raw
return @normalized if @normalized
- doctype = nil
- if @parent
- doc = @parent.document
- doctype = doc.doctype if doc
- end
-
@normalized = Text::normalize( @string, doctype, @entity_filter )
end
@@ -165,12 +242,7 @@ module REXML
# u = Text.new( "sean russell", false, nil, true )
# u.value #-> "sean russell"
def value
- @unnormalized if @unnormalized
- doctype = nil
- if @parent
- doc = @parent.document
- doctype = doc.doctype if doc
- end
+ return @unnormalized if @unnormalized
@unnormalized = Text::unnormalize( @string, doctype )
end
@@ -286,7 +358,7 @@ module REXML
EREFERENCE = /&(?!#{Entity::NAME};)/
# Escapes all possible entities
def Text::normalize( input, doctype=nil, entity_filter=nil )
- copy = input
+ copy = input.to_s
# Doing it like this rather than in a loop improves the speed
#copy = copy.gsub( EREFERENCE, '&amp;' )
copy = copy.gsub( "&", "&amp;" )