diff options
Diffstat (limited to 'lib/unicode_normalize')
| -rw-r--r-- | lib/unicode_normalize/normalize.rb | 31 | ||||
| -rw-r--r-- | lib/unicode_normalize/tables.rb | 49 |
2 files changed, 66 insertions, 14 deletions
diff --git a/lib/unicode_normalize/normalize.rb b/lib/unicode_normalize/normalize.rb index e67fad187a..0447df8de7 100644 --- a/lib/unicode_normalize/normalize.rb +++ b/lib/unicode_normalize/normalize.rb @@ -82,16 +82,22 @@ module UnicodeNormalize # :nodoc: ## Canonical Ordering def self.canonical_ordering_one(string) - sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] } - (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort - (0..i).each do |j| - later_class = sorting[j+1].last - if 0<later_class and later_class<sorting[j].last - sorting[j], sorting[j+1] = sorting[j+1], sorting[j] - end + result = '' + unordered = [] + chars = string.chars + n = chars.size + chars.each_with_index do |char, i| + ccc = CLASS_TABLE[char] + if ccc == 0 + unordered.sort!.each { result << chars[it % n] } + unordered.clear + result << char + else + unordered << ccc * n + i end end - return sorting.collect(&:first).join('') + unordered.sort!.each { result << chars[it % n] } + result end ## Normalization Forms for Patterns (not whole Strings) @@ -105,17 +111,22 @@ module UnicodeNormalize # :nodoc: start = nfd_string[0] last_class = CLASS_TABLE[start]-1 accents = '' + result = '' nfd_string[1..-1].each_char do |accent| accent_class = CLASS_TABLE[accent] if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent] start = composite + elsif accent_class == 0 + result << start << accents + start = accent + accents = '' + last_class = -1 else accents << accent last_class = accent_class end end - accents = nfc_one(accents) if accents.length>1 # TODO: change from recursion to loop - hangul_comp_one(start+accents) + hangul_comp_one(result+start+accents) end def self.normalize(string, form = :nfc) diff --git a/lib/unicode_normalize/tables.rb b/lib/unicode_normalize/tables.rb index b5b708defd..dd5d3499b8 100644 --- a/lib/unicode_normalize/tables.rb +++ b/lib/unicode_normalize/tables.rb @@ -1,8 +1,8 @@ # coding: us-ascii # frozen_string_literal: true -Encoding::UNICODE_VERSION == "16.0.0" or - raise "Unicode version mismatch: 16.0.0 expected but #{Encoding::UNICODE_VERSION}" +Encoding::UNICODE_VERSION == "17.0.0" or + raise "Unicode version mismatch: 17.0.0 expected but #{Encoding::UNICODE_VERSION}" # automatically generated by template/unicode_norm_gen.tmpl @@ -99,7 +99,8 @@ module UnicodeNormalize # :nodoc: "\u1A75-\u1A7C" \ "\u1A7F" \ "\u1AB0-\u1ABD" \ - "\u1ABF-\u1ACE" \ + "\u1ABF-\u1ADD" \ + "\u1AE0-\u1AEB" \ "\u1B34\u1B35" \ "\u1B44" \ "\u1B6B-\u1B73" \ @@ -154,6 +155,7 @@ module UnicodeNormalize # :nodoc: "\u{10D24}-\u{10D27}" \ "\u{10D69}-\u{10D6D}" \ "\u{10EAB}\u{10EAC}" \ + "\u{10EFA}\u{10EFB}" \ "\u{10EFD}-\u{10EFF}" \ "\u{10F46}-\u{10F50}" \ "\u{10F82}-\u{10F85}" \ @@ -230,6 +232,10 @@ module UnicodeNormalize # :nodoc: "\u{1E2EC}-\u{1E2EF}" \ "\u{1E4EC}-\u{1E4EF}" \ "\u{1E5EE}\u{1E5EF}" \ + "\u{1E6E3}" \ + "\u{1E6E6}" \ + "\u{1E6EE}\u{1E6EF}" \ + "\u{1E6F5}" \ "\u{1E8D0}-\u{1E8D6}" \ "\u{1E944}-\u{1E94A}" \ "]" @@ -1460,7 +1466,7 @@ module UnicodeNormalize # :nodoc: "\u3280-\u33FF" \ "\uA69C\uA69D" \ "\uA770" \ - "\uA7F2-\uA7F4" \ + "\uA7F1-\uA7F4" \ "\uA7F8\uA7F9" \ "\uAB5C-\uAB5F" \ "\uAB69" \ @@ -2019,6 +2025,33 @@ module UnicodeNormalize # :nodoc: "\u1ACC"=>230, "\u1ACD"=>230, "\u1ACE"=>230, + "\u1ACF"=>230, + "\u1AD0"=>230, + "\u1AD1"=>230, + "\u1AD2"=>230, + "\u1AD3"=>230, + "\u1AD4"=>230, + "\u1AD5"=>230, + "\u1AD6"=>230, + "\u1AD7"=>230, + "\u1AD8"=>230, + "\u1AD9"=>230, + "\u1ADA"=>230, + "\u1ADB"=>230, + "\u1ADC"=>230, + "\u1ADD"=>220, + "\u1AE0"=>230, + "\u1AE1"=>230, + "\u1AE2"=>230, + "\u1AE3"=>230, + "\u1AE4"=>230, + "\u1AE5"=>230, + "\u1AE6"=>220, + "\u1AE7"=>230, + "\u1AE8"=>230, + "\u1AE9"=>230, + "\u1AEA"=>230, + "\u1AEB"=>234, "\u1B34"=>7, "\u1B44"=>9, "\u1B6B"=>230, @@ -2293,6 +2326,8 @@ module UnicodeNormalize # :nodoc: "\u{10D6D}"=>230, "\u{10EAB}"=>230, "\u{10EAC}"=>230, + "\u{10EFA}"=>220, + "\u{10EFB}"=>220, "\u{10EFD}"=>220, "\u{10EFE}"=>220, "\u{10EFF}"=>220, @@ -2479,6 +2514,11 @@ module UnicodeNormalize # :nodoc: "\u{1E4EF}"=>230, "\u{1E5EE}"=>230, "\u{1E5EF}"=>220, + "\u{1E6E3}"=>230, + "\u{1E6E6}"=>230, + "\u{1E6EE}"=>230, + "\u{1E6EF}"=>230, + "\u{1E6F5}"=>230, "\u{1E8D0}"=>220, "\u{1E8D1}"=>220, "\u{1E8D2}"=>220, @@ -5922,6 +5962,7 @@ module UnicodeNormalize # :nodoc: "\uA69C"=>"\u044A", "\uA69D"=>"\u044C", "\uA770"=>"\uA76F", + "\uA7F1"=>"S", "\uA7F2"=>"C", "\uA7F3"=>"F", "\uA7F4"=>"Q", |
