summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 11
-rw-r--r-- lib/rexml/xmltokens.rb 76
-rw-r--r-- test/rexml/xpath/test_node.rb 40
3 files changed, 122 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index 23df4e4f93..21ea9e510b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+Sun Feb 23 17:55:50 2014 Kouhei Sutou <kou@cozmixng.org>
+
+ * lib/rexml/xmltokens.rb: Add missing valid non-ASCII characters
+ to element name characters. Now, REXML name tokens exactly
+ match "[5] Name" in the XML spec and "[4] NCName" in the
+ Namespaces in XML spec. See the comments in that file for details.
+ [Bug #9539] [ruby-core:60901]
+ Reported by Mario Barcala. Thanks!!!
+
+ * test/rexml/xpath/test_node.rb: Add tests for the above case.
+
Sun Feb 23 12:18:54 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
* ext/socket/raddrinfo.c (inet_pton): use rb_w32_inet_pton, instead of
diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb
index 7dc4e8b2ba..4d4dd27f2d 100644
--- a/lib/rexml/xmltokens.rb
+++ b/lib/rexml/xmltokens.rb
@@ -2,12 +2,78 @@ module REXML
# Defines a number of tokens used for parsing XML. Not for general
# consumption.
module XMLTokens
- NCNAME_STR= '[\w:][\-\w.]*'
- NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+ # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+ #
+ # [4] NameStartChar ::=
+ # ":" |
+ # [A-Z] |
+ # "_" |
+ # [a-z] |
+ # [#xC0-#xD6] |
+ # [#xD8-#xF6] |
+ # [#xF8-#x2FF] |
+ # [#x370-#x37D] |
+ # [#x37F-#x1FFF] |
+ # [#x200C-#x200D] |
+ # [#x2070-#x218F] |
+ # [#x2C00-#x2FEF] |
+ # [#x3001-#xD7FF] |
+ # [#xF900-#xFDCF] |
+ # [#xFDF0-#xFFFD] |
+ # [#x10000-#xEFFFF]
+ name_start_chars = [
+ ":",
+ "A-Z",
+ "_",
+ "a-z",
+ "\\u00C0-\\u00D6",
+ "\\u00D8-\\u00F6",
+ "\\u00F8-\\u02FF",
+ "\\u0370-\\u037D",
+ "\\u037F-\\u1FFF",
+ "\\u200C-\\u200D",
+ "\\u2070-\\u218F",
+ "\\u2C00-\\u2FEF",
+ "\\u3001-\\uD7FF",
+ "\\uF900-\\uFDCF",
+ "\\uFDF0-\\uFFFD",
+ "\\u{10000}-\\u{EFFFF}",
+ ]
+ # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+ #
+ # [4a] NameChar ::=
+ # NameStartChar |
+ # "-" |
+ # "." |
+ # [0-9] |
+ # #xB7 |
+ # [#x0300-#x036F] |
+ # [#x203F-#x2040]
+ name_chars = name_start_chars + [
+ "\\-",
+ "\\.",
+ "0-9",
+ "\\u00B7",
+ "\\u0300-\\u036F",
+ "\\u203F-\\u2040",
+ ]
+ NAME_START_CHAR = "[#{name_start_chars.join('')}]"
+ NAME_CHAR = "[#{name_chars.join('')}]"
+ NAMECHAR = NAME_CHAR # deprecated. Use NAME_CHAR instead.
- NAMECHAR = '[\-\w\.:]'
- NAME = "([\\w:]#{NAMECHAR}*)"
- NMTOKEN = "(?:#{NAMECHAR})+"
+ # From http://www.w3.org/TR/xml-names11/#NT-NCName
+ #
+ # [6] NCNameStartChar ::= NameStartChar - ':'
+ ncname_start_chars = name_start_chars - [":"]
+ # From http://www.w3.org/TR/xml-names11/#NT-NCName
+ #
+ # [5] NCNameChar ::= NameChar - ':'
+ ncname_chars = name_chars - [":"]
+ NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*"
+ NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+
+ NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)"
+ NMTOKEN = "(?:#{NAME_CHAR})+"
NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
diff --git a/test/rexml/xpath/test_node.rb b/test/rexml/xpath/test_node.rb
new file mode 100644
index 0000000000..db7a2efca6
--- /dev/null
+++ b/test/rexml/xpath/test_node.rb
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+require_relative "../rexml_test_utils"
+
+require "rexml/document"
+
+class TestXPathNode < Test::Unit::TestCase
+ def matches(xml, xpath)
+ document = REXML::Document.new(xml)
+ REXML::XPath.each(document, xpath).collect(&:to_s)
+ end
+
+ class TestQName < self
+ def test_ascii
+ xml = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<root>
+ <ascii>
+ <child>child</child>
+ </ascii>
+</root>
+ XML
+ assert_equal(["<child>child</child>"],
+ matches(xml, "/root/ascii/child"))
+ end
+
+ def test_non_ascii
+ xml = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<root>
+ <non-àscii>
+ <child>child</child>
+ </non-àscii>
+</root>
+ XML
+ assert_equal(["<child>child</child>"],
+ matches(xml, "/root/non-àscii/child"))
+ end
+ end
+end