From 0dc342de848a642ecce8db697b8fecd83a63e117 Mon Sep 17 00:00:00 2001 From: yugui Date: Mon, 25 Aug 2008 15:02:05 +0000 Subject: added tag v1_9_0_4 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/tags/v1_9_0_4@18845 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- trunk/lib/rexml/text.rb | 404 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 404 insertions(+) create mode 100644 trunk/lib/rexml/text.rb (limited to 'trunk/lib/rexml/text.rb') diff --git a/trunk/lib/rexml/text.rb b/trunk/lib/rexml/text.rb new file mode 100644 index 0000000000..fac5ac3e41 --- /dev/null +++ b/trunk/lib/rexml/text.rb @@ -0,0 +1,404 @@ +require 'rexml/entity' +require 'rexml/doctype' +require 'rexml/child' +require 'rexml/doctype' +require 'rexml/parseexception' + +module REXML + # Represents text nodes in an XML document + class Text < Child + include Comparable + # The order in which the substitutions occur + SPECIALS = [ /&(?!#?[\w-]+;)/u, //u, /"/u, /'/u, /\r/u ] + SUBSTITUTES = ['&', '<', '>', '"', ''', ' '] + # Characters which are substituted in written strings + SLAICEPS = [ '<', '>', '"', "'", '&' ] + SETUTITSBUS = [ /</u, />/u, /"/u, /'/u, /&/u ] + + # If +raw+ is true, then REXML leaves the value alone + attr_accessor :raw + + NEEDS_A_SECOND_CHECK = /(<|&((#{Entity::NAME});|(#0*((?:\d+)|(?:x[a-fA-F0-9]+)));)?)/um + NUMERICENTITY = /�*((?:\d+)|(?:x[a-fA-F0-9]+));/ + VALID_CHAR = [ + 0x9, 0xA, 0xD, + (0x20..0xD7FF), + (0xE000..0xFFFD), + (0x10000..0x10FFFF) + ] + + if String.method_defined? :encode + VALID_XML_CHARS = Regexp.new('^['+ + VALID_CHAR.map { |item| + case item + when Fixnum + [item].pack('U').force_encoding('utf-8') + when Range + [item.first, '-'.ord, item.last].pack('UUU').force_encoding('utf-8') + end + }.join + + ']*$') + else + VALID_XML_CHARS = /^( + [\x09\x0A\x0D\x20-\x7E] # ASCII + | [\xC2-\xDF][\x80-\xBF] # non-overlong 2-byte + | \xE0[\xA0-\xBF][\x80-\xBF] # excluding overlongs + | [\xE1-\xEC\xEE][\x80-\xBF]{2} # straight 3-byte + | \xEF[\x80-\xBE]{2} # + | \xEF\xBF[\x80-\xBD] # excluding U+fffe and U+ffff + | \xED[\x80-\x9F][\x80-\xBF] # excluding surrogates + | \xF0[\x90-\xBF][\x80-\xBF]{2} # planes 1-3 + | [\xF1-\xF3][\x80-\xBF]{3} # planes 4-15 + | \xF4[\x80-\x8F][\x80-\xBF]{2} # plane 16 + )*$/nx; + end + + # Constructor + # +arg+ if a String, the content is set to the String. If a Text, + # the object is shallowly cloned. + # + # +respect_whitespace+ (boolean, false) if true, whitespace is + # respected + # + # +parent+ (nil) if this is a Parent object, the parent + # will be set to this. + # + # +raw+ (nil) This argument can be given three values. + # If true, then the value of used to construct this object is expected to + # contain no unescaped XML markup, and REXML will not change the text. If + # this value is false, the string may contain any characters, and REXML will + # escape any and all defined entities whose values are contained in the + # text. If this value is nil (the default), then the raw value of the + # parent will be used as the raw value for this node. If there is no raw + # value for the parent, and no value is supplied, the default is false. + # Use this field if you have entities defined for some text, and you don't + # want REXML to escape that text in output. + # Text.new( "<&", false, nil, false ) #-> "<&" + # Text.new( "<&", false, nil, false ) #-> "&lt;&amp;" + # Text.new( "<&", false, nil, true ) #-> Parse exception + # Text.new( "<&", false, nil, true ) #-> "<&" + # # Assume that the entity "s" is defined to be "sean" + # # and that the entity "r" is defined to be "russell" + # Text.new( "sean russell" ) #-> "&s; &r;" + # Text.new( "sean russell", false, nil, true ) #-> "sean russell" + # + # +entity_filter+ (nil) This can be an array of entities to match in the + # supplied text. This argument is only useful if +raw+ is set to false. + # Text.new( "sean russell", false, nil, false, ["s"] ) #-> "&s; russell" + # Text.new( "sean russell", false, nil, true, ["s"] ) #-> "sean russell" + # In the last example, the +entity_filter+ argument is ignored. + # + # +pattern+ INTERNAL USE ONLY + def initialize(arg, respect_whitespace=false, parent=nil, raw=nil, + entity_filter=nil, illegal=NEEDS_A_SECOND_CHECK ) + + @raw = false + + if parent + super( parent ) + @raw = parent.raw + else + @parent = nil + end + + @raw = raw unless raw.nil? + @entity_filter = entity_filter + @normalized = @unnormalized = nil + + if arg.kind_of? String + @string = arg.clone + @string.squeeze!(" \n\t") unless respect_whitespace + elsif arg.kind_of? Text + @string = arg.to_s + @raw = arg.raw + elsif + raise "Illegal argument of type #{arg.type} for Text constructor (#{arg})" + end + + @string.gsub!( /\r\n?/, "\n" ) + + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + def parent= parent + super(parent) + Text.check(@string, NEEDS_A_SECOND_CHECK, doctype) if @raw and @parent + end + + # check for illegal characters + def Text.check string, pattern, doctype + + # illegal anywhere + if string !~ VALID_XML_CHARS + if String.method_defined? :encode + string.chars.each do |c| + case c.ord + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + else + string.scan(/[\x00-\x7F]|[\x80-\xBF][\xC0-\xF0]*|[\xC0-\xF0]/n) do |c| + case c.unpack('U') + when *VALID_CHAR + else + raise "Illegal character #{c.inspect} in raw string \"#{string}\"" + end + end + end + end + + # context sensitive + string.scan(pattern) do + if $1[-1] != ?; + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + elsif $1[0] == ?& + if $5 and $5[0] == ?# + case ($5[1] == ?x ? $5[2..-1].to_i(16) : $5[1..-1].to_i) + when *VALID_CHAR + else + raise "Illegal character '#{$1}' in raw string \"#{string}\"" + end + elsif $3 and !SUBSTITUTES.include?($1) + if !doctype or !doctype.entities.has_key?($3) + raise "Undeclared entity '#{$1}' in raw string \"#{string}\"" + end + end + end + end + end + + def node_type + :text + end + + def empty? + @string.size==0 + end + + + def clone + return Text.new(self) + end + + + # Appends text to this text node. The text is appended in the +raw+ mode + # of this text node. + def <<( to_append ) + @string << to_append.gsub( /\r\n?/, "\n" ) + end + + + # +other+ a String or a Text + # +returns+ the result of (to_s <=> arg.to_s) + def <=>( other ) + to_s() <=> other.to_s + end + + def doctype + if @parent + doc = @parent.document + doc.doctype if doc + end + end + + REFERENCE = /#{Entity::REFERENCE}/ + # Returns the string value of this text node. This string is always + # escaped, meaning that it is a valid XML text node string, and all + # entities that can be escaped, have been inserted. This method respects + # the entity filter set in the constructor. + # + # # Assume that the entity "s" is defined to be "sean", and that the + # # entity "r" is defined to be "russell" + # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) + # t.to_s #-> "< & &s; russell" + # t = Text.new( "< & &s; russell", false, nil, false ) + # t.to_s #-> "< & &s; russell" + # u = Text.new( "sean russell", false, nil, true ) + # u.to_s #-> "sean russell" + def to_s + return @string if @raw + return @normalized if @normalized + + @normalized = Text::normalize( @string, doctype, @entity_filter ) + end + + def inspect + @string.inspect + end + + # Returns the string value of this text. This is the text without + # entities, as it might be used programmatically, or printed to the + # console. This ignores the 'raw' attribute setting, and any + # entity_filter. + # + # # Assume that the entity "s" is defined to be "sean", and that the + # # entity "r" is defined to be "russell" + # t = Text.new( "< & sean russell", false, nil, false, ['s'] ) + # t.value #-> "< & sean russell" + # t = Text.new( "< & &s; russell", false, nil, false ) + # t.value #-> "< & sean russell" + # u = Text.new( "sean russell", false, nil, true ) + # u.value #-> "sean russell" + def value + return @unnormalized if @unnormalized + @unnormalized = Text::unnormalize( @string, doctype ) + end + + # Sets the contents of this text node. This expects the text to be + # unnormalized. It returns self. + # + # e = Element.new( "a" ) + # e.add_text( "foo" ) # foo + # e[0].value = "bar" # bar + # e[0].value = "" # <a> + def value=( val ) + @string = val.gsub( /\r\n?/, "\n" ) + @unnormalized = nil + @normalized = nil + @raw = false + end + + def wrap(string, width, addnewline=false) + # Recursively wrap string at width. + return string if string.length <= width + place = string.rindex(' ', width) # Position in string with last ' ' before cutoff + if addnewline then + return "\n" + string[0,place] + "\n" + wrap(string[place+1..-1], width) + else + return string[0,place] + "\n" + wrap(string[place+1..-1], width) + end + end + + def indent_text(string, level=1, style="\t", indentfirstline=true) + return string if level < 0 + new_string = '' + string.each { |line| + indent_string = style * level + new_line = (indent_string + line).sub(/[\s]+$/,'') + new_string << new_line + } + new_string.strip! unless indentfirstline + return new_string + end + + # == DEPRECATED + # See REXML::Formatters + # + def write( writer, indent=-1, transitive=false, ie_hack=false ) + Kernel.warn("#{self.class.name}.write is deprecated. See REXML::Formatters") + formatter = if indent > -1 + REXML::Formatters::Pretty.new( indent ) + else + REXML::Formatters::Default.new + end + formatter.write( self, writer ) + end + + # FIXME + # This probably won't work properly + def xpath + path = @parent.xpath + path += "/text()" + return path + end + + # Writes out text, substituting special characters beforehand. + # +out+ A String, IO, or any other object supporting <<( String ) + # +input+ the text to substitute and the write out + # + # z=utf8.unpack("U*") + # ascOut="" + # z.each{|r| + # if r < 0x100 + # ascOut.concat(r.chr) + # else + # ascOut.concat(sprintf("&#x%x;", r)) + # end + # } + # puts ascOut + def write_with_substitution out, input + copy = input.clone + # Doing it like this rather than in a loop improves the speed + copy.gsub!( SPECIALS[0], SUBSTITUTES[0] ) + copy.gsub!( SPECIALS[1], SUBSTITUTES[1] ) + copy.gsub!( SPECIALS[2], SUBSTITUTES[2] ) + copy.gsub!( SPECIALS[3], SUBSTITUTES[3] ) + copy.gsub!( SPECIALS[4], SUBSTITUTES[4] ) + copy.gsub!( SPECIALS[5], SUBSTITUTES[5] ) + out << copy + end + + # Reads text, substituting entities + def Text::read_with_substitution( input, illegal=nil ) + copy = input.clone + + if copy =~ illegal + raise ParseException.new( "malformed text: Illegal character #$& in \"#{copy}\"" ) + end if illegal + + copy.gsub!( /\r\n?/, "\n" ) + if copy.include? ?& + copy.gsub!( SETUTITSBUS[0], SLAICEPS[0] ) + copy.gsub!( SETUTITSBUS[1], SLAICEPS[1] ) + copy.gsub!( SETUTITSBUS[2], SLAICEPS[2] ) + copy.gsub!( SETUTITSBUS[3], SLAICEPS[3] ) + copy.gsub!( SETUTITSBUS[4], SLAICEPS[4] ) + copy.gsub!( /�*((?:\d+)|(?:x[a-f0-9]+));/ ) { + m=$1 + #m='0' if m=='' + m = "0#{m}" if m[0] == ?x + [Integer(m)].pack('U*') + } + end + copy + end + + EREFERENCE = /&(?!#{Entity::NAME};)/ + # Escapes all possible entities + def Text::normalize( input, doctype=nil, entity_filter=nil ) + copy = input.to_s + # Doing it like this rather than in a loop improves the speed + #copy = copy.gsub( EREFERENCE, '&' ) + copy = copy.gsub( "&", "&" ) + if doctype + # Replace all ampersands that aren't part of an entity + doctype.entities.each_value do |entity| + copy = copy.gsub( entity.value, + "&#{entity.name};" ) if entity.value and + not( entity_filter and entity_filter.include?(entity) ) + end + else + # Replace all ampersands that aren't part of an entity + DocType::DEFAULT_ENTITIES.each_value do |entity| + copy = copy.gsub(entity.value, "&#{entity.name};" ) + end + end + copy + end + + # Unescapes all possible entities + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil ) + string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { + ref = $& + if ref[1] == ?# + if ref[2] == ?x + [ref[3...-1].to_i(16)].pack('U*') + else + [ref[2...-1].to_i].pack('U*') + end + elsif ref == '&' + '&' + elsif filter and filter.include?( ref[1...-1] ) + ref + elsif doctype + doctype.entity( ref[1...-1] ) or ref + else + entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ] + entity_value ? entity_value.value : ref + end + } + end + end +end -- cgit v1.2.3