3 files changed, 122 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index 23df4e4f93..21ea9e510b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+Sun Feb 23 17:55:50 2014  Kouhei Sutou  <kou@cozmixng.org>
+
+	* lib/rexml/xmltokens.rb: Add the missing valid non-ASCII
+	  characters to element name characters. Now, REXML name tokens
+	  exactly match "[5] Name" in the XML spec and "[4] NCName" in the
+	  Namespaces in XML spec. See the comments there for details.
+	  [Bug #9539]  [ruby-core:60901]
+	  Reported by Mario Barcala. Thanks!!!
+
+	* test/rexml/xpath/test_node.rb: Add tests for the above case.
+
 Sun Feb 23 12:18:54 2014  Nobuyoshi Nakada  <nobu@ruby-lang.org>
 
 	* ext/socket/raddrinfo.c (inet_pton): use rb_w32_inet_pton, instead of
diff --git a/lib/rexml/xmltokens.rb b/lib/rexml/xmltokens.rb
index 7dc4e8b2ba..4d4dd27f2d 100644
--- a/lib/rexml/xmltokens.rb
+++ b/lib/rexml/xmltokens.rb
@@ -2,12 +2,78 @@ module REXML
   # Defines a number of tokens used for parsing XML.  Not for general
   # consumption.
   module XMLTokens
-    NCNAME_STR= '[\w:][\-\w.]*'
-    NAME_STR= "(?:#{NCNAME_STR}:)?#{NCNAME_STR}"
+    # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+    #
+    #   [4] NameStartChar ::=
+    #         ":" |
+    #         [A-Z] |
+    #         "_" |
+    #         [a-z] |
+    #         [#xC0-#xD6] |
+    #         [#xD8-#xF6] |
+    #         [#xF8-#x2FF] |
+    #         [#x370-#x37D] |
+    #         [#x37F-#x1FFF] |
+    #         [#x200C-#x200D] |
+    #         [#x2070-#x218F] |
+    #         [#x2C00-#x2FEF] |
+    #         [#x3001-#xD7FF] |
+    #         [#xF900-#xFDCF] |
+    #         [#xFDF0-#xFFFD] |
+    #         [#x10000-#xEFFFF]
+    name_start_chars = [ # character-class fragments, one per alternative listed above
+      ":",
+      "A-Z",
+      "_",
+      "a-z",
+      "\\u00C0-\\u00D6",
+      "\\u00D8-\\u00F6",
+      "\\u00F8-\\u02FF",
+      "\\u0370-\\u037D",
+      "\\u037F-\\u1FFF",
+      "\\u200C-\\u200D",
+      "\\u2070-\\u218F",
+      "\\u2C00-\\u2FEF",
+      "\\u3001-\\uD7FF",
+      "\\uF900-\\uFDCF",
+      "\\uFDF0-\\uFFFD",
+      "\\u{10000}-\\u{EFFFF}", # more than four hex digits, hence the braced escape form
+    ]
+    # From http://www.w3.org/TR/REC-xml/#sec-common-syn
+    #
+    #   [4a] NameChar ::=
+    #      NameStartChar |
+    #      "-" |
+    #      "." |
+    #      [0-9] |
+    #      #xB7 |
+    #      [#x0300-#x036F] |
+    #      [#x203F-#x2040]
+    name_chars = name_start_chars + [ # every NameStartChar fragment plus the extra ones
+      "\\-", # backslash-escaped so "-" is a literal, not a range operator, inside [...]
+      "\\.",
+      "0-9",
+      "\\u00B7",
+      "\\u0300-\\u036F",
+      "\\u203F-\\u2040",
+    ]
+    NAME_START_CHAR = "[#{name_start_chars.join('')}]" # pattern text (a String) for one NameStartChar
+    NAME_CHAR = "[#{name_chars.join('')}]" # pattern text (a String) for one NameChar
+    NAMECHAR = NAME_CHAR # Deprecated: the older constant name, kept as an alias. Use NAME_CHAR instead.
 
-    NAMECHAR = '[\-\w\.:]'
-    NAME = "([\\w:]#{NAMECHAR}*)"
-    NMTOKEN = "(?:#{NAMECHAR})+"
+    # From http://www.w3.org/TR/xml-names11/#NT-NCName
+    #
+    #   [6] NCNameStartChar ::= NameStartChar - ':'
+    ncname_start_chars = name_start_chars - [":"] # Array difference drops the ":" fragment
+    # From http://www.w3.org/TR/xml-names11/#NT-NCName
+    #
+    #   [5] NCNameChar ::= NameChar - ':'
+    ncname_chars = name_chars - [":"] # same fragments as name_chars, minus ":"
+    NCNAME_STR = "[#{ncname_start_chars.join('')}][#{ncname_chars.join('')}]*" # one start char, then zero or more NCName chars
+    NAME_STR = "(?:#{NCNAME_STR}:)?#{NCNAME_STR}" # optional "NCName:" part, then an NCName; no capture group
+
+    NAME = "(#{NAME_START_CHAR}#{NAME_CHAR}*)" # one start char, then name chars; wrapped in a capture group
+    NMTOKEN = "(?:#{NAME_CHAR})+" # one or more name chars, non-capturing
     NMTOKENS = "#{NMTOKEN}(\\s+#{NMTOKEN})*"
     REFERENCE = "(?:&#{NAME};|&#\\d+;|&#x[0-9a-fA-F]+;)"
 
diff --git a/test/rexml/xpath/test_node.rb b/test/rexml/xpath/test_node.rb
new file mode 100644
index 0000000000..db7a2efca6
--- /dev/null
+++ b/test/rexml/xpath/test_node.rb
@@ -0,0 +1,40 @@
+# -*- coding: utf-8 -*-
+
+require_relative "../rexml_test_utils"
+
+require "rexml/document"
+
+class TestXPathNode < Test::Unit::TestCase
+  def matches(xml, xpath) # helper: to_s of each node that xpath selects in the parsed xml
+    document = REXML::Document.new(xml)
+    REXML::XPath.each(document, xpath).collect(&:to_s)
+  end
+
+  class TestQName < self # subclass of the outer test case, so #matches is inherited
+    def test_ascii # path step through an element whose name is plain ASCII
+      xml = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<root>
+  <ascii>
+    <child>child</child>
+  </ascii>
+</root>
+      XML
+      assert_equal(["<child>child</child>"],
+                   matches(xml, "/root/ascii/child"))
+    end
+
+    def test_non_ascii # same document shape, but the element name contains a non-ASCII letter
+      xml = <<-XML
+<?xml version="1.0" encoding="UTF-8"?>
+<root>
+  <non-àscii>
+    <child>child</child>
+  </non-àscii>
+</root>
+    XML
+      assert_equal(["<child>child</child>"],
+                   matches(xml, "/root/non-àscii/child"))
+    end
+  end
+end