# coding: utf-8 # Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp) # Script to generate Ruby data structures used in implementing # String#unicode_normalize,... # Constants for input and ouput directory InputDataDir = $input || 'enc/unicode/data' OuputDataDir = $ouput || 'lib/unicode_normalize' # convenience methods class Integer def to_UTF8() # convert to string, taking legibility into account if self>0xFFFF "\\u{#{to_s(16).upcase}}" elsif self>0x7f "\\u#{to_s(16).upcase.rjust(4, '0')}" else chr.sub(/[\\\"]/, '\\\&') end end end class Array def line_slice(new_line) # joins items, 8 items per line ary = [] 0.step(size-1, 8) {|i| ary << self[i, 8].join('') } ary.join(new_line).gsub(/ +$/, '') end def to_UTF8() collect {|c| c.to_UTF8}.join('') end def to_regexp_chars # converts an array of Integers to character ranges sort.inject([]) do |ranges, value| if ranges.last and ranges.last[1]+1>=value ranges.last[1] = value ranges else ranges << [value, value] end end.collect do |first, last| case last-first when 0 first.to_UTF8 when 1 first.to_UTF8 + last.to_UTF8 else first.to_UTF8 + '-' + last.to_UTF8 end end.line_slice "\" \\\n \"" end end class Hash def to_hash_string collect do |key, value| "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\".freeze, " end.line_slice "\n " end end # read the file 'CompositionExclusions.txt' composition_exclusions = File.open("#{InputDataDir}/CompositionExclusions.txt") {|f| f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex} } decomposition_table = {} kompatible_table = {} CombiningClass = {} # constant to allow use in Integer#to_UTF8 # read the file 'UnicodeData.txt' IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line| codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") case decomposition when /^[0-9A-F]/ decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex} when /^$/ and (char_class!="0" or decomposition!="") warn "Unexpected: Character range with data relevant to normalization!" end end # calculate compositions from decompositions composition_table = decomposition_table.reject do |character, decomposition| composition_exclusions.member? character or # predefined composition exclusion decomposition.length<=1 or # Singleton Decomposition CombiningClass[character] or # character is not a Starter CombiningClass[decomposition.first] # decomposition begins with a character that is not a Starter end.invert # recalculate composition_exclusions composition_exclusions = decomposition_table.keys - composition_table.values accent_array = CombiningClass.keys + composition_table.keys.collect {|key| key.last} composition_starters = composition_table.keys.collect {|key| key.first} hangul_no_trailing = [] 0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c} # expand decomposition table values decomposition_table.each do |key, value| position = 0 while position < value.length if decomposition = decomposition_table[value[position]] decomposition_table[key] = value = value.dup # avoid overwriting composition_table key value[position, 1] = decomposition else position += 1 end end end # deal with relationship between canonical and kompatibility decompositions decomposition_table.each do |key, value| value = value.dup expanded = false position = 0 while position < value.length if decomposition = kompatible_table[value[position]] value[position, 1] = decomposition expanded = true else position += 1 end end kompatible_table[key] = value if expanded end class_table_str = CombiningClass.collect do |key, value| "\"#{key.to_UTF8}\"=>#{value}, " end.line_slice "\n " # generate normalization tables file open("#{OuputDataDir}/tables.rb", "w").print <