1 files changed, 44 insertions, 25 deletions
diff --git a/lib/unicode_normalize/normalize.rb b/lib/unicode_normalize/normalize.rb
index 1511f75f8a..0447df8de7 100644
--- a/lib/unicode_normalize/normalize.rb
+++ b/lib/unicode_normalize/normalize.rb
@@ -1,11 +1,27 @@
 # coding: utf-8
+# frozen_string_literal: false
 
 # Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
 
-require 'unicode_normalize/tables.rb'
-
-
-module UnicodeNormalize
+# This file, the companion file tables.rb (autogenerated), and the module,
+# constants, and methods defined herein are part of the implementation of the
+# built-in String class, not part of the standard library. They should
+# therefore never be gemified. They implement the methods
+# String#unicode_normalize, String#unicode_normalize!, and String#unicode_normalized?.
+#
+# They are placed here because they are written in Ruby. They are loaded on
+# demand when any of the three methods mentioned above is executed for the
+# first time. This reduces the memory footprint and startup time for scripts
+# and applications that do not use those methods.
+#
+# The name and even the existence of the module UnicodeNormalize and all of its
+# content are purely an implementation detail, and should not be exposed in
+# any test or spec or otherwise.
+
+require_relative 'tables'
+
+# :stopdoc:
+module UnicodeNormalize  # :nodoc:
   ## Constant for max hash capacity to avoid DoS attack
   MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow
 
@@ -21,10 +37,6 @@ module UnicodeNormalize
                          hash.shift if hash.length>MAX_HASH_LENGTH # prevent DoS attack
                          hash[key] = nfc_one(key)
                        end
-  NF_HASH_K = Hash.new do |hash, key|
-                         hash.shift if hash.length>MAX_HASH_LENGTH # prevent DoS attack
-                         hash[key] = nfkd_one(key)
-                       end
 
   ## Constants For Hangul
   # for details such as the meaning of the identifiers below, please see
@@ -58,7 +70,7 @@ module UnicodeNormalize
     if length>1 and 0 <= (lead =string[0].ord-LBASE) and lead  < LCOUNT and
                     0 <= (vowel=string[1].ord-VBASE) and vowel < VCOUNT
       lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT
-      if length>2 and 0 <= (trail=string[2].ord-TBASE) and trail < TCOUNT
+      if length>2 and 0 < (trail=string[2].ord-TBASE) and trail < TCOUNT
         (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1]
       else
         lead_vowel.chr(Encoding::UTF_8) + string[2..-1]
@@ -70,16 +82,22 @@ module UnicodeNormalize
 
   ## Canonical Ordering
   def self.canonical_ordering_one(string)
-    sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
-    (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
-      (0..i).each do |j|
-        later_class = sorting[j+1].last
-        if 0<later_class and later_class<sorting[j].last
-          sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
-        end
+    result = '' # output buffer, appended to in place
+    unordered = [] # sort keys for the current run of characters whose class is nonzero
+    chars = string.chars
+    n = chars.size
+    chars.each_with_index do |char, i|
+      ccc = CLASS_TABLE[char] # presumably the canonical combining class (table comes from tables.rb)
+      if ccc == 0
+        unordered.sort!.each { result << chars[it % n] } # flush the run; key % n recovers the original index
+        unordered.clear
+        result << char
+      else
+        unordered << ccc * n + i # single integer key: orders by class first, then by original position
       end
     end
-    return sorting.collect(&:first).join('')
+    unordered.sort!.each { result << chars[it % n] } # flush a run that reaches the end of the string
+    result
   end
 
   ## Normalization Forms for Patterns (not whole Strings)
@@ -88,25 +106,27 @@ module UnicodeNormalize
     canonical_ordering_one(hangul_decomp_one(string))
   end
 
-  def self.nfkd_one(string)
-    string.chars.map! {|c| KOMPATIBLE_TABLE[c] || c}.join('')
-  end
-
   def self.nfc_one(string)
     nfd_string = nfd_one string
     start = nfd_string[0]
     last_class = CLASS_TABLE[start]-1
     accents = ''
+    result = '' # segments that are already finished, in output order
     nfd_string[1..-1].each_char do |accent|
       accent_class = CLASS_TABLE[accent]
       if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent]
         start = composite
+      elsif accent_class == 0 # a class-0 character that did not compose with start begins a new segment
+        result << start << accents # emit the segment collected so far
+        start = accent
+        accents = ''
+        last_class = -1 # -1 is less than class 0, so even a class-0 character passes the last_class<accent_class test
       else
         accents << accent
         last_class = accent_class
       end
     end
-    hangul_comp_one(start+accents)
+    hangul_comp_one(result+start+accents) # finished segments precede the final one
   end
 
   def self.normalize(string, form = :nfc)
@@ -119,9 +139,9 @@ module UnicodeNormalize
       when :nfd then
         string.gsub REGEXP_D, NF_HASH_D
       when :nfkc then
-        string.gsub(REGEXP_K, NF_HASH_K).gsub REGEXP_C, NF_HASH_C
+        string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_C, NF_HASH_C)
       when :nfkd then
-        string.gsub(REGEXP_K, NF_HASH_K).gsub REGEXP_D, NF_HASH_D
+        string.gsub(REGEXP_K, KOMPATIBLE_TABLE).gsub(REGEXP_D, NF_HASH_D)
       else
         raise ArgumentError, "Invalid normalization form #{form}."
       end
@@ -164,5 +184,4 @@ module UnicodeNormalize
       raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}"
     end
   end
-
 end # module