summaryrefslogtreecommitdiff
path: root/tool/unicode_norm_gen.rb
blob: 766be26dc405e2c3d13a8940f5b398de2e619bd9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# coding: utf-8

# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...

# Constants for input and output directories.
# Each defaults to a path relative to the current directory and can be
# overridden by setting the global $input / $ouput before this script runs.
# The spelling "ouput" is part of the existing global and constant names
# and is kept so that existing references keep working.
InputDataDir = $input || 'enc/unicode/data'
OuputDataDir = $ouput || 'lib/unicode_normalize'

# convenience methods
class Integer
  # Renders this code point as text suitable for placing inside a
  # double-quoted Ruby string literal, taking legibility into account:
  #
  # * above U+FFFF       -> \u{HEX} (uppercase hex, no padding)
  # * above U+007F       -> \uHHHH  (uppercase hex, zero-padded to 4 digits)
  # * U+007F and below   -> the character itself, with a backslash put in
  #                         front of a backslash or a double quote
  def to_UTF8()
    return "\\u{#{to_s(16).upcase}}" if self > 0xFFFF
    return "\\u#{to_s(16).upcase.rjust(4, '0')}" if self > 0x7f

    # single character, so sub (first match only) is sufficient
    chr.sub(/[\\\"]/, "\\\\\\\&")
  end
end

class Array
  # Concatenates the items in groups of eight, joins the groups with
  # +new_line+, and removes trailing spaces from the end of every line.
  def line_slice(new_line)
    lines = each_slice(8).map {|group| group.join('') }
    lines.join(new_line).gsub(/ +$/, '')
  end

  # Concatenates the to_UTF8 representation of every item.
  def to_UTF8()
    map {|item| item.to_UTF8 }.join('')
  end

  # Converts an array of Integers to the contents of a regexp character
  # class: the values are sorted, runs of duplicate or consecutive values
  # are merged, and each run is written as a single character, two adjacent
  # characters, or a "first-last" range. The pieces are then laid out eight
  # per line, each line closing the current string literal and reopening it
  # on a continuation line.
  def to_regexp_chars
    runs = []
    sort.each do |value|
      current = runs.last
      if current and current[1]+1 >= value
        current[1] = value       # extend the run (also absorbs duplicates)
      else
        runs << [value, value]   # start a new run
      end
    end

    pieces = runs.map do |first, last|
      if first == last
        first.to_UTF8
      elsif last - first == 1
        first.to_UTF8 + last.to_UTF8
      else
        first.to_UTF8 + '-' + last.to_UTF8
      end
    end
    pieces.line_slice "\" \\\n    \""
  end
end

class Hash
  # Renders every pair as a `"key"=>"value".freeze, ` entry (keys and values
  # converted with to_UTF8) and lays the entries out eight per line, with
  # continuation lines indented by four spaces.
  def to_hash_string
    entries = map do |key, value|
      "\"#{key.to_UTF8}\"=>\"#{value.to_UTF8}\".freeze, "
    end
    entries.line_slice "\n    "
  end
end

# read the file 'CompositionExclusions.txt':
# every line that starts with 4 or 5 uppercase letters/digits contributes
# the value of its leading hexadecimal number
composition_exclusions = File.open("#{InputDataDir}/CompositionExclusions.txt") do |file|
  file.grep(/^[A-Z0-9]{4,5}/).map {|entry| entry.hex }
end

decomposition_table = {}  # code point => canonical decomposition (Array of code points)
kompatible_table = {}     # code point => compatibility decomposition (Array of code points)
CombiningClass = {}       # code point => non-zero combining class; a constant (not a
                          # local) so that it can be reached from inside method bodies

# read the file 'UnicodeData.txt' (one semicolon-separated record per line)
IO.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
  codepoint, name, char_class, decomposition = line.split(";").values_at(0, 1, 3, 5)
  code = codepoint.hex

  case decomposition
  when /^[0-9A-F]/  # starts directly with a code point: canonical decomposition
    decomposition_table[code] = decomposition.split(' ').map {|word| word.hex }
  when /^</         # starts with a <tag>: compatibility decomposition, tag dropped
    kompatible_table[code] = decomposition.split(' ')[1..-1].map {|word| word.hex }
  end
  CombiningClass[code] = char_class.to_i unless char_class == "0"

  # First/Last range markers are not expanded by this script, so they must
  # not carry any data that matters for normalization
  if name =~ /(First|Last)>$/ and (char_class != "0" or decomposition != "")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end

# calculate compositions from decompositions:
# drop every decomposition that must not be used for composing, then swap
# keys and values so that the decomposition maps to the composed character
composition_table = decomposition_table.reject do |character, decomposition|
  next true if composition_exclusions.member?(character)  # predefined composition exclusion
  next true if decomposition.length <= 1                  # Singleton Decomposition
  next true if CombiningClass[character]                  # character is not a Starter
  CombiningClass[decomposition.first]                     # decomposition begins with a character that is not a Starter
end.invert

# recalculate composition_exclusions:
# every decomposable character that is not the result of a composition
composition_exclusions = decomposition_table.keys - composition_table.values

# all characters with a non-zero combining class, plus the last character
# of every composable sequence
accent_array = CombiningClass.keys + composition_table.keys.map {|sequence| sequence.last }

# the first character of every composable sequence
composition_starters = composition_table.keys.map {|sequence| sequence.first }

# every 28th code point from U+AC00 up to U+D7A3
hangul_no_trailing = 0xAC00.step(0xD7A3, 28).to_a

# expand decomposition table values:
# keep replacing any code point that itself has a decomposition until no
# such code point is left in the value
decomposition_table.each do |character, expansion|
  index = 0
  until index >= expansion.length
    replacement = decomposition_table[expansion[index]]
    if replacement
      # work on a copy: the original array may also be a key of
      # composition_table and must not be overwritten
      expansion = expansion.dup
      decomposition_table[character] = expansion
      expansion[index, 1] = replacement  # stay at this index to re-examine the inserted code points
    else
      index += 1
    end
  end
end

# deal with relationship between canonical and kompatibility decompositions:
# when a canonical decomposition contains code points that have a
# kompatibility decomposition, record the further-expanded sequence as the
# kompatibility decomposition of the original character
decomposition_table.each do |character, canonical|
  expansion = canonical.dup  # leave the canonical table untouched
  changed = false
  index = 0
  until index >= expansion.length
    replacement = kompatible_table[expansion[index]]
    if replacement
      expansion[index, 1] = replacement  # stay at this index to re-examine the inserted code points
      changed = true
    else
      index += 1
    end
  end
  kompatible_table[character] = expansion if changed
end

# one `"char"=>class, ` entry per character with a non-zero combining
# class, eight entries per line
class_table_entries = CombiningClass.map do |codepoint, combining_class|
  "\"#{codepoint.to_UTF8}\"=>#{combining_class}, "
end
class_table_str = class_table_entries.line_slice "\n    "

# generate normalization tables file
#
# The content is assembled completely before the output file is opened, so a
# failure while interpolating the tables cannot leave a truncated tables.rb
# behind. The block form of File.open closes (and thereby flushes) the
# handle as soon as the content has been written, instead of leaving that to
# garbage collection or process exit.
#
# The precomposed Hangul syllable range in REGEXP_D_STRING ends at U+D7A3
# (0xAC00 + 11172 - 1), the same upper bound that hangul_no_trailing uses.
tables_source = <<MAPPING_TABLE_FILE_END
# coding: us-ascii

# automatically generated by tool/unicode_norm_gen.rb

module UnicodeNormalize
  accents = "" \\
    "[#{accent_array.to_regexp_chars}]" \\
  "".freeze
  ACCENTS = accents
  REGEXP_D_STRING = "\#{''  # composition starters and composition exclusions
    }" \\
    "[#{(composition_table.values+composition_exclusions).to_regexp_chars}]\#{accents}*" \\
    "|\#{''  # characters that can be the result of a composition, except composition starters
    }" \\
    "[#{(composition_starters-composition_table.values).to_regexp_chars}]?\#{accents}+" \\
    "|\#{''  # precomposed Hangul syllables
    }" \\
    "[\\u{AC00}-\\u{D7A3}]" \\
  "".freeze
  REGEXP_C_STRING = "\#{''  # composition exclusions
    }" \\
    "[#{composition_exclusions.to_regexp_chars}]\#{accents}*" \\
    "|\#{''  # composition starters and characters that can be the result of a composition
    }" \\
    "[#{(composition_starters+composition_table.values).to_regexp_chars}]?\#{accents}+" \\
    "|\#{''  # Hangul syllables with separate trailer
    }" \\
    "[#{hangul_no_trailing.to_regexp_chars}][\\u11A8-\\u11C2]" \\
    "|\#{''  # decomposed Hangul syllables
    }" \\
    "[\\u1100-\\u1112][\\u1161-\\u1175][\\u11A8-\\u11C2]?" \\
  "".freeze
  REGEXP_K_STRING = "" \\
    "[#{kompatible_table.keys.to_regexp_chars}]" \\
  "".freeze

  class_table = {
    #{class_table_str}
  }
  class_table.default = 0
  CLASS_TABLE = class_table.freeze

  DECOMPOSITION_TABLE = {
    #{decomposition_table.to_hash_string}
  }.freeze

  KOMPATIBLE_TABLE = {
    #{kompatible_table.to_hash_string}
  }.freeze

  COMPOSITION_TABLE = {
    #{composition_table.to_hash_string}
  }.freeze
end
MAPPING_TABLE_FILE_END

File.open("#{OuputDataDir}/tables.rb", "w") {|file| file.print tables_source }