summaryrefslogtreecommitdiff
path: root/tool
diff options
context:
space:
mode:
author: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2014-10-06 01:27:34 +0000
committer: duerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2014-10-06 01:27:34 +0000
commit: 33447b80d52f395b26c31a907648503129b1d077 (patch)
tree: fbc0175a4b9fdf64c62dbe27db1761017301e444 /tool
parent: 0fb67d59b2279540d99333ef1ef601e826fdf5d6 (diff)
tool/unicode_norm_gen.rb: Data generation script imported from
https://github.com/duerst/eprun/blob/master/lib/generate.rb

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@47808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool')
-rw-r--r-- tool/unicode_norm_gen.rb 178
1 file changed, 178 insertions, 0 deletions
diff --git a/tool/unicode_norm_gen.rb b/tool/unicode_norm_gen.rb
new file mode 100644
index 0000000000..90eba75f3a
--- /dev/null
+++ b/tool/unicode_norm_gen.rb
@@ -0,0 +1,178 @@
+# coding: utf-8
+
+# Copyright 2010-2013 Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+# available under the same licence as Ruby itself
+# (see http://www.ruby-lang.org/en/LICENSE.txt)
+
+class Integer
+  # Renders this code point as the text to put inside a double-quoted string
+  # literal of the generated tables file:
+  #
+  # * above U+FFFF: a "\u{...}" escape (upper-case hex, not padded)
+  # * has an entry in CombiningClass, or is a backslash or a double quote:
+  #   a "\uXXXX" escape padded to four hex digits
+  # * anything else: the character itself, encoded as UTF-8
+  def to_UTF8()
+    hex = to_s(16).upcase
+    return "\\u{#{hex}}" if self > 0xFFFF
+
+    escaped = CombiningClass[self] || self == '\\'.ord || self == '"'.ord
+    escaped ? "\\u#{hex.rjust(4, '0')}" : chr(Encoding::UTF_8)
+  end
+end
+
+class Array
+  # Concatenates the items in groups of 16 and places +new_line+ between
+  # consecutive groups.
+  def line_slice(new_line)
+    each_slice(16).map { |group| group.join }.join(new_line)
+  end
+
+  # Concatenates the to_UTF8 form of every element.
+  def to_UTF8()
+    map { |item| item.to_UTF8 }.join
+  end
+
+  # Converts an array of Integers to character-range text. The values are
+  # sorted, and duplicates and direct successors are merged into runs. A run
+  # of one is written as a single character, a run of two as both characters,
+  # and a longer run as "first-last". The pieces are then laid out with
+  # line_slice.
+  def to_regexp_chars
+    runs = []
+    sort.each do |code|
+      open_run = runs.last
+      if open_run && open_run[1] + 1 >= code
+        open_run[1] = code # duplicate or direct successor: extend the run
+      else
+        runs << [code, code]
+      end
+    end
+
+    pieces = runs.map do |low, high|
+      if high == low
+        low.to_UTF8
+      elsif high - low == 1
+        low.to_UTF8 + high.to_UTF8
+      else
+        low.to_UTF8 + '-' + high.to_UTF8
+      end
+    end
+    pieces.line_slice "\" +\n \""
+  end
+end
+
+class Hash
+  # Renders every pair as "key"=>"value" followed by a comma and a space,
+  # using to_UTF8 on both key and value, and lays the entries out with
+  # line_slice (16 per line, separated by a newline and one space).
+  def to_hash_string
+    entries = map { |key, value| %("#{key.to_UTF8}"=>"#{value.to_UTF8}", ) }
+    entries.line_slice "\n "
+  end
+end
+
+# Load 'CompositionExclusions.txt': keep only the lines that begin with at
+# least four upper-case letters or digits, and convert the first
+# whitespace-separated field of each from hexadecimal.
+composition_exclusions = IO.readlines("../data/CompositionExclusions.txt")
+  .grep(/^[A-Z0-9]{4,5}/)
+  .map { |line| line.split(' ').first.hex }
+
+decomposition_table = {}
+kompatible_table = {}
+CombiningClass = {} # constant to allow use in Integer#to_UTF8
+
+# Scan 'UnicodeData.txt' (semicolon-separated fields) line by line:
+#  * sixth field starting with a hex digit -> decomposition_table entry
+#  * sixth field starting with "<"         -> kompatible_table entry, with the
+#    leading tag dropped
+#  * fourth field other than "0"           -> CombiningClass entry
+# A warning is printed when a First>/Last> range line carries such data.
+IO.foreach("../data/UnicodeData.txt") do |line|
+  fields = line.split(";")
+  codepoint, name, char_class, decomposition = fields.values_at(0, 1, 3, 5)
+
+  case decomposition
+  when /^[0-9A-F]/
+    decomposition_table[codepoint.hex] = decomposition.split(' ').map(&:hex)
+  when /^</
+    kompatible_table[codepoint.hex] = decomposition.split(' ').drop(1).map(&:hex)
+  end
+  CombiningClass[codepoint.hex] = char_class.to_i unless char_class == "0"
+
+  range_marker = name =~ /(First|Last)>$/
+  if range_marker && (char_class != "0" || decomposition != "")
+    warn "Unexpected: Character range with data relevant to normalization!"
+  end
+end
+
+# Derive compositions from the canonical decompositions by dropping every
+# entry that must not be recomposed and inverting what remains, so that the
+# decomposition array becomes the key and the character the value.
+composition_table = decomposition_table.reject { |character, decomposition|
+  composition_exclusions.include?(character) || # predefined composition exclusion
+    decomposition.length <= 1 ||                # Singleton Decomposition
+    CombiningClass[character] ||                # character is not a Starter
+    CombiningClass[decomposition.first]         # decomposition begins with a character that is not a Starter
+}.invert
+
+# Recalculate composition_exclusions: every decomposable character that is
+# not the result of a composition.
+composition_exclusions = decomposition_table.keys - composition_table.values
+
+# Characters with a CombiningClass entry, plus the last element of every
+# composition key.
+accent_array = CombiningClass.keys + composition_table.keys.map { |pair| pair.last }
+
+# First element of every composition key.
+composition_starters = composition_table.keys.map { |pair| pair.first }
+
+# Every 28th code point from 0xAC00 through 0xD7A3.
+hangul_no_trailing = 0xAC00.step(0xD7A3, 28).to_a
+
+# expand decomposition table values
+# Each value is rewritten until none of its elements is itself a key of
+# decomposition_table. Only existing keys are reassigned, so the hash gains
+# no entries while it is being iterated.
+decomposition_table.each do |key, value|
+ position = 0
+ while position < value.length
+ if decomposition = decomposition_table[value[position]]
+ # Work on a copy: composition_table was built with invert, so the original
+ # array object may also be serving as one of its keys.
+ decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
+ value[position, 1] = decomposition
+ # position is left unchanged so the first spliced-in element is checked next
+ else
+ position += 1
+ end
+ end
+end
+
+# Relate canonical and kompatibility decompositions: in a copy of each
+# canonical decomposition, replace elements by their kompatible_table entry
+# (re-checking the same position after each replacement). The result is
+# stored in kompatible_table under the same key, but only when at least one
+# replacement took place.
+decomposition_table.each do |key, value|
+  expansion = value.dup
+  changed = false
+  index = 0
+  until index >= expansion.length
+    replacement = kompatible_table[expansion[index]]
+    if replacement
+      expansion[index, 1] = replacement
+      changed = true
+    else
+      index += 1
+    end
+  end
+  kompatible_table[key] = expansion if changed
+end
+
+# Body of the generated CLASS_TABLE: one "char"=>class entry per
+# CombiningClass pair, each followed by a comma and a space, 16 per line.
+class_table_str = CombiningClass.map { |codepoint, combining_class|
+  %("#{codepoint.to_UTF8}"=>#{combining_class}, )
+}.line_slice("\n ")
+
+# generate normalization tables file
+# File.write opens the output, writes the text and closes the handle in one
+# call, so the file is flushed and released before the script continues.
+# NOTE(review): the precomposed-Hangul range below ends at U+D7A4, while
+# hangul_no_trailing stops at 0xD7A3 -- confirm the intended upper bound.
+# NOTE(review): the generated header still names generate.rb -- confirm
+# whether it should name this script instead.
+File.write("normalize_tables.rb", <<MAPPING_TABLE_FILE_END)
+# coding: utf-8
+
+# automatically generated by generate.rb
+
+module Normalize
+ ACCENTS = "
+ [#{accent_array.to_regexp_chars}]
+ "
+ REGEXP_D_STRING = " # composition starters and composition exclusions
+ [#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{ACCENTS}*
+ | # characters that can be the result of a composition, except composition starters
+ [#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{ACCENTS}+
+ | # precomposed Hangul syllables
+ [\\u{AC00}-\\u{D7A4}]
+ "
+ REGEXP_C_STRING = " # composition exclusions
+ [#{composition_exclusions.to_regexp_chars}]\#{ACCENTS}*
+ | # composition starters and characters that can be the result of a composition
+ [#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{ACCENTS}+
+ | # Hangul syllables with separate trailer
+ [#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]
+ | # decomposed Hangul syllables
+ [\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?
+ "
+ REGEXP_K_STRING = "
+ [#{kompatible_table.keys.to_regexp_chars}]
+ "
+
+ CLASS_TABLE = {
+ #{class_table_str}
+ }
+ CLASS_TABLE.default = 0
+
+ DECOMPOSITION_TABLE = {
+ #{decomposition_table.to_hash_string}
+ }
+
+ KOMPATIBLE_TABLE = {
+ #{kompatible_table.to_hash_string}
+ }
+
+ COMPOSITION_TABLE = {
+ #{composition_table.to_hash_string}
+ }
+end
+MAPPING_TABLE_FILE_END