summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortompng <tomoyapenguin@gmail.com>2025-09-09 21:21:22 +0900
committerMari Imaizumi <mariimaizumi5@gmail.com>2025-10-25 21:19:29 +0900
commit377aa2a336cc700485c699ac49330f2a58b74906 (patch)
treea384fbeee12d8ace57f135eb7ad18131ba7ae317
parent31e14ac7dadc99851fefbb5d5d4232ba9f568f1b (diff)
Improve performance of UnicodeNormalize.canonical_ordering_one
Use array_of_integer.sort! instead of buble-sort-like algorithm
-rw-r--r--lib/unicode_normalize/normalize.rb22
-rw-r--r--test/test_unicode_normalize.rb19
2 files changed, 33 insertions, 8 deletions
diff --git a/lib/unicode_normalize/normalize.rb b/lib/unicode_normalize/normalize.rb
index e67fad187a..7872f8a0bc 100644
--- a/lib/unicode_normalize/normalize.rb
+++ b/lib/unicode_normalize/normalize.rb
@@ -82,16 +82,22 @@ module UnicodeNormalize # :nodoc:
## Canonical Ordering
def self.canonical_ordering_one(string)
- sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] }
- (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort
- (0..i).each do |j|
- later_class = sorting[j+1].last
- if 0<later_class and later_class<sorting[j].last
- sorting[j], sorting[j+1] = sorting[j+1], sorting[j]
- end
+ result = ''
+ unordered = []
+ chars = string.chars
+ n = chars.size
+ chars.each_with_index do |char, i|
+ ccc = CLASS_TABLE[char]
+ if ccc == 0
+ unordered.sort!.each { result << chars[it % n] }
+ unordered.clear
+ result << char
+ else
+ unordered << ccc * n + i
end
end
- return sorting.collect(&:first).join('')
+ unordered.sort!.each { result << chars[it % n] }
+ result
end
## Normalization Forms for Patterns (not whole Strings)
diff --git a/test/test_unicode_normalize.rb b/test/test_unicode_normalize.rb
index 8789ed92d2..98f46551d3 100644
--- a/test/test_unicode_normalize.rb
+++ b/test/test_unicode_normalize.rb
@@ -209,4 +209,23 @@ class TestUnicodeNormalize
assert_equal true, ascii_string.unicode_normalized?(:nfkc)
assert_equal true, ascii_string.unicode_normalized?(:nfkd)
end
+
+ def test_canonical_ordering
+ a = "\u03B1\u0313\u0300\u0345"
+ a_unordered1 = "\u03B1\u0345\u0313\u0300"
+ a_unordered2 = "\u03B1\u0313\u0345\u0300"
+ u1 = "U\u0308\u0304"
+ u2 = "U\u0304\u0308"
+ s = "s\u0323\u0307"
+ s_unordered = "s\u0307\u0323"
+ o = "\u{1611e}\u{1611e}\u{1611f}"
+ # Actual cases called through String#unicode_normalize
+ assert_equal(s + o, UnicodeNormalize.canonical_ordering_one(s_unordered + o))
+ assert_equal(a[1..], UnicodeNormalize.canonical_ordering_one(a_unordered1[1..]))
+ assert_equal(a[1..] + o, UnicodeNormalize.canonical_ordering_one(a_unordered2[1..] + o))
+ # Artificial cases
+ assert_equal(a + u1 + o + u2 + s, UnicodeNormalize.canonical_ordering_one(a + u1 + o + u2 + s))
+ assert_equal(s[1..] + a + a, UnicodeNormalize.canonical_ordering_one(s_unordered[1..] + a_unordered1 + a_unordered2))
+ assert_equal(o + s + u1 + a + o + a + u2 + o, UnicodeNormalize.canonical_ordering_one(o + s_unordered + u1 + a_unordered1 + o + a_unordered2 + u2 + o))
+ end
end