diff options
Diffstat (limited to 'lib/unicode_normalize/normalize.rb')
| -rw-r--r-- | lib/unicode_normalize/normalize.rb | 53 |
1 files changed, 40 insertions, 13 deletions
diff --git a/lib/unicode_normalize/normalize.rb b/lib/unicode_normalize/normalize.rb index 18080ce03d..0447df8de7 100644 --- a/lib/unicode_normalize/normalize.rb +++ b/lib/unicode_normalize/normalize.rb @@ -1,11 +1,27 @@ # coding: utf-8 +# frozen_string_literal: false # Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp) -require 'unicode_normalize/tables.rb' +# This file, the companion file tables.rb (autogenerated), and the module, +# constants, and method defined herein are part of the implementation of the +# built-in String class, not part of the standard library. They should +# therefore never be gemified. They implement the methods +# String#unicode_normalize, String#unicode_normalize!, and String#unicode_normalized?. +# +# They are placed here because they are written in Ruby. They are loaded on +# demand when any of the three methods mentioned above is executed for the +# first time. This reduces the memory footprint and startup time for scripts +# and applications that do not use those methods. +# +# The name and even the existence of the module UnicodeNormalize and all of its +# content are purely an implementation detail, and should not be exposed in +# any test or spec or otherwise. +require_relative 'tables' -module UnicodeNormalize +# :stopdoc: +module UnicodeNormalize # :nodoc: ## Constant for max hash capacity to avoid DoS attack MAX_HASH_LENGTH = 18000 # enough for all test cases, otherwise tests get slow @@ -54,7 +70,7 @@ module UnicodeNormalize if length>1 and 0 <= (lead =string[0].ord-LBASE) and lead < LCOUNT and 0 <= (vowel=string[1].ord-VBASE) and vowel < VCOUNT lead_vowel = SBASE + (lead * VCOUNT + vowel) * TCOUNT - if length>2 and 0 <= (trail=string[2].ord-TBASE) and trail < TCOUNT + if length>2 and 0 < (trail=string[2].ord-TBASE) and trail < TCOUNT (lead_vowel + trail).chr(Encoding::UTF_8) + string[3..-1] else lead_vowel.chr(Encoding::UTF_8) + string[2..-1] @@ -66,16 +82,22 @@ module UnicodeNormalize ## Canonical Ordering def self.canonical_ordering_one(string) - sorting = string.each_char.collect { |c| [c, CLASS_TABLE[c]] } - (sorting.length-2).downto(0) do |i| # almost, but not exactly bubble sort - (0..i).each do |j| - later_class = sorting[j+1].last - if 0<later_class and later_class<sorting[j].last - sorting[j], sorting[j+1] = sorting[j+1], sorting[j] - end + result = '' + unordered = [] + chars = string.chars + n = chars.size + chars.each_with_index do |char, i| + ccc = CLASS_TABLE[char] + if ccc == 0 + unordered.sort!.each { result << chars[it % n] } + unordered.clear + result << char + else + unordered << ccc * n + i end end - return sorting.collect(&:first).join('') + unordered.sort!.each { result << chars[it % n] } + result end ## Normalization Forms for Patterns (not whole Strings) @@ -89,16 +111,22 @@ module UnicodeNormalize start = nfd_string[0] last_class = CLASS_TABLE[start]-1 accents = '' + result = '' nfd_string[1..-1].each_char do |accent| accent_class = CLASS_TABLE[accent] if last_class<accent_class and composite = COMPOSITION_TABLE[start+accent] start = composite + elsif accent_class == 0 + result << start << accents + start = accent + accents = '' + last_class = -1 else accents << accent last_class = accent_class end end - hangul_comp_one(start+accents) + hangul_comp_one(result+start+accents) end def self.normalize(string, form = :nfc) @@ -156,5 +184,4 @@ module UnicodeNormalize raise Encoding::CompatibilityError, "Unicode Normalization not appropriate for #{encoding}" end end - end # module |
