summaryrefslogtreecommitdiff
path: root/template/unicode_norm_gen.tmpl
blob: f83fc2d0cd5d4e44bef898fda82f7ecb9075eebc (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...

# Directory holding the Unicode data files: the first command-line argument,
# or 'enc/unicode/data' when none is given.
InputDataDir = ARGV[0] || 'enc/unicode/data'
# Version number of the form x.y.z taken from InputDataDir: the last path
# component (preceded by a slash) that consists of three dot-separated numbers.
# nil when the path has no such component.
unicode_version = InputDataDir[/.*\/(\d+\.\d+\.\d+)(?=\/|\z)/, 1]

# convenience methods
class Integer
  # Renders this code point as text for a double-quoted string literal:
  # * up to 0x7f: the character itself, with a backslash put in front of
  #   a backslash or a double quote
  # * up to 0xFFFF: \u followed by at least four uppercase hex digits
  # * above 0xFFFF: \u{...} with uppercase hex digits
  def to_UTF8()
    return chr.sub(/[\\\"]/, "\\\\\\\&") if self <= 0x7f

    hex = to_s(16).upcase
    self <= 0xFFFF ? "\\u#{hex.rjust(4, '0')}" : "\\u{#{hex}}"
  end
end

module Enumerable
  # Defined only when the running Ruby does not already provide each_slice.
  unless method_defined?(:each_slice)
    # Yields consecutive groups of n elements (the final group may be
    # shorter) and returns the receiver.
    def each_slice(n)
      group = []
      each do |element|
        group << element
        next if group.size < n
        yield group
        group = []
      end
      yield group unless group.empty?
      self
    end
  end
end

class Array
  # Concatenation of to_UTF8 applied to every element.
  def to_UTF8()
    map {|c| c.to_UTF8}.join('')
  end

  # Treats the receiver as a list of Integers, sorts it, merges duplicates and
  # runs of consecutive values into ranges, renders each range with to_UTF8
  # ("a", "ab" for two neighbours, "a-z" for longer runs) and yields the
  # rendered pieces joined in groups of n.
  def each_regexp_chars(n = 8)
    ranges = []
    sort.each do |value|
      current = ranges.last
      if current and current[1]+1>=value
        current[1] = value          # extend the open range
      else
        ranges << [value, value]    # start a new range
      end
    end

    pieces = ranges.map do |first, last|
      span = last-first
      if span == 0
        first.to_UTF8
      elsif span == 1
        first.to_UTF8 + last.to_UTF8
      else
        first.to_UTF8 + '-' + last.to_UTF8
      end
    end

    pieces.each_slice(n) {|slice| yield slice.join('') }
  end
end

# read the file 'CompositionExclusions.txt'
# The first line has to carry the file name with its version, in the form
# "# <basename>-<version><extension>"; that version must equal the one taken
# from the directory name (when there was one). Every following line that
# starts with at least four characters out of A-Z/0-9 is read as a hexadecimal
# code point. The block result (an Array of Integers) becomes
# composition_exclusions.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt", 'rb') {|f|
  base = Regexp.quote(File.basename(f.path, '.*'))
  ext = Regexp.quote(File.extname(f.path))
  # gets returns nil for an empty file; guard it so that this case ends in the
  # abort message below rather than in a NoMethodError on nil
  line = f.gets
  version = (line && line[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1]) or
    abort "No file version in #{f.path}: #{line}"
  (unicode_version ||= version) == version or
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  # String#hex stops at the first non-hex character, so trailing text is ignored
  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}
}

decomposition_table = {}
kompatible_table = {}
combining_class = {}  # constant to allow use in Integer#to_UTF8

# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
  codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")

  case decomposition
  when /^[0-9A-F]/
    decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
  when /^</
    kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
  end
  combining_class[codepoint.hex] = char_class.to_i if char_class != "0"

  if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end

# calculate compositions from decompositions
# Entries matching any criterion below are dropped; the remaining hash is
# inverted, so composition_table maps a decomposition (array) to its character.
composition_table = decomposition_table.reject do |char, parts|
  composition_exclusions.include?(char) ||  # predefined composition exclusion
    parts.size <= 1 ||                      # Singleton Decomposition
    combining_class[char] ||                # character is not a Starter
    combining_class[parts.first]            # decomposition begins with a character that is not a Starter
end.invert

# recalculate composition_exclusions
# From here on: every character with a decomposition that is not the result
# of a composition.
composition_exclusions = decomposition_table.keys - composition_table.values

# first code point of every composable decomposition
composition_starters = composition_table.keys.map {|parts| parts.first}

# every code point with a recorded combining class, followed by the last code
# point of every composable decomposition
accent_array = combining_class.keys + composition_table.keys.map {|parts| parts.last}

# every 28th code point from 0xAC00 up to 0xD7A3
hangul_no_trailing = []
0xAC00.step(0xD7A3, 28) {|syllable| hangul_no_trailing.push(syllable)}

# expand decomposition table values
# Each value is rewritten until none of its code points has an entry of its
# own in decomposition_table, i.e. decompositions are expanded recursively.
decomposition_table.each do |key, value|
  position = 0
  while position < value.length
    if decomposition = decomposition_table[value[position]]
      # Store a copy under the existing key instead of editing the array in
      # place; only existing keys are reassigned while iterating.
      decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
      # splice the nested decomposition in place of the single code point;
      # position stays put so the spliced-in code points are examined as well
      value[position, 1] = decomposition
    else
      position += 1
    end
  end
end

# deal with relationship between canonical and kompatibility decompositions
# For every entry of decomposition_table, substitute each code point that has
# an entry in kompatible_table. When at least one substitution happened, the
# result is stored in kompatible_table under the same key.
decomposition_table.each do |key, value|
  value = value.dup # work on a copy; the decomposition_table entry is not modified
  expanded = false
  position = 0
  while position < value.length
    if decomposition = kompatible_table[value[position]]
      # position is not advanced, so the inserted code points are checked again
      value[position, 1] = decomposition
      expanded = true
    else
      position += 1
    end
  end
  kompatible_table[key] = value if expanded
end

# Expand kompatible_table entries that still contain code points with entries
# of their own, repeating until a complete pass changes nothing.
# The block evaluates to the newly assigned array (truthy) when an entry was
# rewritten and to nil otherwise; any? therefore returns true as soon as one
# entry changed, and the empty loop body simply triggers the next pass.
while kompatible_table.any? {|key, value|
        expanded = value.map {|v| kompatible_table[v] || v}.flatten
        kompatible_table[key] = expanded unless value == expanded
      }
end

# generate normalization tables file
%># coding: us-ascii
# frozen_string_literal: true
%# >

# automatically generated by template/unicode_norm_gen.tmpl

%# Body of the generated file. Plain text is copied to the output; ERB tags and
%# lines starting with "%" (such as these notes) run at generation time and
%# leave no trace in the output.
module UnicodeNormalize  # :nodoc:
%# Character class built from accent_array: all code points with a recorded
%# combining class plus the last code point of every composable decomposition.
  accents = "" \
    "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]"
  ACCENTS = accents
%# The '#{'' # text' pieces below interpolate an empty string; they only serve
%# to carry a comment inside the chain of adjacent string literals.
%# NOTE(review): composition_table.values are the characters that result from
%# a composition (composition_table is an inverted decomposition table), so the
%# emitted comments of the first two REGEXP_D_STRING alternatives do not
%# obviously describe the expressions below them - confirm before relying on them.
  REGEXP_D_STRING = "#{''  # composition starters and composition exclusions
    }" \
    "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]#{accents}*" \
    "|#{''  # characters that can be the result of a composition, except composition starters
    }" \
    "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]?#{accents}+" \
    "|#{''  # precomposed Hangul syllables
    }" \
%# The last precomposed Hangul syllable is U+D7A3, the same upper bound that
%# hangul_no_trailing is built with.
    "[\u{AC00}-\u{D7A3}]"
  REGEXP_C_STRING = "#{''  # composition exclusions
    }" \
    "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]#{accents}*" \
    "|#{''  # composition starters and characters that can be the result of a composition
    }" \
    "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]?#{accents}+" \
    "|#{''  # Hangul syllables with separate trailer
    }" \
    "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>][\u11A8-\u11C2]" \
    "|#{''  # decomposed Hangul syllables
    }" \
    "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?"
%# Character class of every code point that has a kompatible_table entry.
  REGEXP_K_STRING = "" \
    "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
    "<%end%>]"

%# All four tables are written eight entries per output line, with keys (and
%# string values) rendered through to_UTF8.
%# class_table holds only the combining classes recorded as non-zero; lookups
%# of any other key fall back to the default of 0 set below.
  class_table = {
% combining_class.each_slice(8) do |slice|
   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
% end
  }
  class_table.default = 0
  CLASS_TABLE = class_table.freeze

  DECOMPOSITION_TABLE = {
% decomposition_table.each_slice(8) do |slice|
   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
% end
  }.freeze

  KOMPATIBLE_TABLE = {
% kompatible_table.each_slice(8) do |slice|
   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
% end
  }.freeze

%# composition_table maps a decomposition (array of code points) to its
%# character, so both key and value are rendered as strings here.
  COMPOSITION_TABLE = {
% composition_table.each_slice(8) do |slice|
   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>"<%=%>,<% end%>
% end
  }.freeze
end