diff options
| author | tompng <tomoyapenguin@gmail.com> | 2025-09-09 21:21:22 +0900 |
|---|---|---|
| committer | Mari Imaizumi <mariimaizumi5@gmail.com> | 2025-10-25 21:19:29 +0900 |
| commit | 377aa2a336cc700485c699ac49330f2a58b74906 (patch) | |
| tree | a384fbeee12d8ace57f135eb7ad18131ba7ae317 | |
| parent | 31e14ac7dadc99851fefbb5d5d4232ba9f568f1b (diff) | |
Improve performance of UnicodeNormalize.canonical_ordering_one
Use array_of_integer.sort! instead of buble-sort-like algorithm
| -rw-r--r-- | lib/unicode_normalize/normalize.rb | 22 | ||||
| -rw-r--r-- | test/test_unicode_normalize.rb | 19 |
2 files changed, 33 insertions, 8 deletions
diff --git a/lib/unicode_normalize/normalize.rb b/lib/unicode_normalize/normalize.rb index e67fad187a..7872f8a0bc 100644 --- a/lib/unicode_normalize/normalize.rb +++ b/lib/unicode_normalize/normalize.rb @@ -82,16 +82,22 @@ module UnicodeNormalize # :nodoc: ## Canonical Ordering def self.canonical_ordering_one(string) - sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] } - (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort - (0..i).each do |j| - later_class = sorting[j+1].last - if 0<later_class and later_class<sorting[j].last - sorting[j], sorting[j+1] = sorting[j+1], sorting[j] - end + result = '' + unordered = [] + chars = string.chars + n = chars.size + chars.each_with_index do |char, i| + ccc = CLASS_TABLE[char] + if ccc == 0 + unordered.sort!.each { result << chars[it % n] } + unordered.clear + result << char + else + unordered << ccc * n + i end end - return sorting.collect(&:first).join('') + unordered.sort!.each { result << chars[it % n] } + result end ## Normalization Forms for Patterns (not whole Strings) diff --git a/test/test_unicode_normalize.rb b/test/test_unicode_normalize.rb index 8789ed92d2..98f46551d3 100644 --- a/test/test_unicode_normalize.rb +++ b/test/test_unicode_normalize.rb @@ -209,4 +209,23 @@ class TestUnicodeNormalize assert_equal true, ascii_string.unicode_normalized?(:nfkc) assert_equal true, ascii_string.unicode_normalized?(:nfkd) end + + def test_canonical_ordering + a = "\u03B1\u0313\u0300\u0345" + a_unordered1 = "\u03B1\u0345\u0313\u0300" + a_unordered2 = "\u03B1\u0313\u0345\u0300" + u1 = "U\u0308\u0304" + u2 = "U\u0304\u0308" + s = "s\u0323\u0307" + s_unordered = "s\u0307\u0323" + o = "\u{1611e}\u{1611e}\u{1611f}" + # Actual cases called through String#unicode_normalize + assert_equal(s + o, UnicodeNormalize.canonical_ordering_one(s_unordered + o)) + assert_equal(a[1..], UnicodeNormalize.canonical_ordering_one(a_unordered1[1..])) + assert_equal(a[1..] + o, UnicodeNormalize.canonical_ordering_one(a_unordered2[1..] + o)) + # Artificial cases + assert_equal(a + u1 + o + u2 + s, UnicodeNormalize.canonical_ordering_one(a + u1 + o + u2 + s)) + assert_equal(s[1..] + a + a, UnicodeNormalize.canonical_ordering_one(s_unordered[1..] + a_unordered1 + a_unordered2)) + assert_equal(o + s + u1 + a + o + a + u2 + o, UnicodeNormalize.canonical_ordering_one(o + s_unordered + u1 + a_unordered1 + o + a_unordered2 + u2 + o)) + end end |
