tool/enc-unicode.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253

#!/usr/bin/env ruby

# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes.
#
# To use this, get UnicodeData.txt and Scripts.txt from unicode.org
# (http://unicode.org/Public/UNIDATA/), then run the following command:
#   ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
# The output is a source file for gperf.
# After this, simply make ruby.

# Exactly two input paths are required (UnicodeData.txt, then Scripts.txt);
# otherwise print the usage line to standard error and exit with status 1.
if ARGV.size != 2
  abort "Usage: #{$0} UnicodeData.txt Scripts.txt"
end

# Names of the POSIX bracket classes, in the order their tables and ctype
# indices are emitted. Frozen so the emission order cannot be altered by
# accident while the script runs.
POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII].freeze

# Collapses a list of codepoints into inclusive [first, last] ranges.
#
# codepoints - Array of Integer codepoints in any order; duplicates are
#              allowed. The array itself is not modified.
#
# Returns an Array of two-element Arrays in ascending order. Each pair covers
# one maximal run of consecutive codepoints; a lone codepoint yields a pair
# that starts and ends on the same value, e.g. [0x0020, 0x0020]. An empty
# input yields an empty Array.
def pair_codepoints(codepoints)
  # Sort a copy so the caller's array keeps its original order.
  sorted = codepoints.sort
  return [] if sorted.empty?

  pairs = [[sorted.first, sorted.first]]
  sorted.each do |codepoint|
    range_end = pairs.last.last
    # Duplicates (and the very first element) are already covered.
    next if codepoint == range_end

    if codepoint == range_end.next
      # Directly follows the current range: extend it.
      pairs.last[1] = codepoint
    else
      # A gap closes the current range and opens a new one.
      pairs << [codepoint, codepoint]
    end
  end
  pairs
end

# Parses UnicodeData.txt and derives every codepoint list this tool emits.
#
# file - path to UnicodeData.txt. Each data line is a ';'-separated record
#        whose first three fields are the codepoint in hex, the character
#        name and the General Category. Blank lines and lines starting with
#        '#' are ignored.
#
# Returns a two-element Array [gcps, data]:
#   gcps - sorted names of the properties collected from the file itself
#          ('Any', 'Assigned', 'Cn', each General Category such as 'Lu' and
#          each Major Category such as 'L'), captured before the POSIX
#          classes are added to data.
#   data - Hash mapping those names, plus the derived POSIX bracket names
#          ('Alnum', 'Alpha', ...), to Arrays of Integer codepoints. The
#          Arrays are not necessarily sorted.
def parse_unicode_data(file)
  last_cp = 0
  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
  beg_cp = nil
  # File.foreach rather than IO.foreach: the path must only ever be opened as
  # a file, never interpreted as a command.
  File.foreach(file) do |line|
    next if line =~ /\A\s*(?:#|\z)/

    fields = line.split(';')
    cp = fields[0].to_i(16)

    # Large blocks are listed as a '<Name, First>' / '<Name, Last>' pair of
    # lines instead of one line per codepoint.
    case fields[1]
    when /\A<(.*),\s*First>\z/
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed in
    # UnicodeData.txt so we must derive them by looking for 'holes' in the range
    # of listed codepoints: every codepoint strictly between the last one seen
    # and the start of the current entry is unassigned.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn.
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder

  # Cn is a General Category of the Major Category C, so every unassigned
  # codepoint (the holes found above as well as the tail) belongs in C.
  data['C'] = (data['C'] || []) + data['Cn']

  # Define General Category properties
  gcps = data.keys.sort

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #

  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']

  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']

  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a

  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]

  # cntrl    Control
  data['Cntrl'] = data['Cc']

  # digit    Decimal_Number
  data['Digit'] = data['Nd']

  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']

  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare this way to marry it with the
  # above definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']

  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #               0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]

  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']

  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, a-f, A-F)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a

  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']

  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  # Space and C are removed in two separate steps: a single
  # `-= data['Space'] - data['C']` would subtract (Space - C) instead.
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
  data['Graph'] -= data['Space']
  data['Graph'] -= data['C']

  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]

  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a

  # Returns General Category Property names and the data
  [gcps, data]
end


# Parses Scripts.txt and prints one codepoint table per script.
#
# file - path to Scripts.txt. Data lines have the form
#        "0041..005A    ; Latin ..." (a range) or "00AA    ; Latin ..." (a
#        single codepoint). A "# Total code points: " line closes the
#        section of the script named on the preceding data lines.
#
# Side effect: for each closed section, calls
# make_const(script, pair_codepoints(codepoints), 'Script'), in file order.
#
# Returns an Array of the script names, in the order they were emitted.
def parse_scripts(file)
  script = nil
  data = []
  names = []
  # File.foreach rather than IO.foreach: the path must only ever be opened as
  # a file, never interpreted as a command.
  File.foreach(file) do |line|
    if line =~ /^# Total code points: /
      # A total line with nothing collected has no table to emit.
      next if script.nil? || data.empty?

      make_const(script, pair_codepoints(data), 'Script')
      names << script
      data = []
    elsif (m = /^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?\s*;\s*(\w+)/.match(line))
      # The range separator is a literal "..", so the dots are escaped.
      script = m[3]
      first = m[1].to_i(16)
      last = m[2] ? m[2].to_i(16) : first
      data.concat((first..last).to_a)
    end
  end
  names
end

# make_const(prop, pairs, name): Prints a 'static const' structure for a
# given property to standard output.
#
# prop  - property name; becomes the suffix of the CR_<prop> identifier.
# pairs - Array of [first, last] Integer codepoint pairs. It is only read;
#         the caller's pairs keep their Integer values.
# name  - human-friendly name for the group, printed in the leading comment.
#
# Each codepoint is printed as "0x" followed by at least four lowercase hex
# digits (0 is printed as 0x0000).
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    puts format("\t0x%04x, 0x%04x,", pair.first, pair.last)
  end
  puts "}; /* CR_#{prop} */"
end

# Returns a lower-cased copy of name with every hyphen, space and underscore
# removed. The argument itself is left unchanged.
def normalize_propname(name)
  name.downcase.gsub(/[- _]/, '')
end

# Driver: writes the complete gperf source to standard output. The order of
# the statements below is the order of the output, so it must be preserved.

puts '%{'
gcps, data = parse_unicode_data(ARGV[0])

# Tables for the POSIX bracket classes are emitted outside the #ifdef.
POSIX_NAMES.each do |posix|
  make_const(posix, pair_codepoints(data[posix]), "[[:#{posix}:]]")
end

print "\n#ifdef USE_UNICODE_PROPERTIES"
gcps.each do |prop|
  # One-letter names are Major Categories, two-letter names are General
  # Categories; anything else gets a placeholder label.
  category = {1 => 'Major Category', 2 => 'General Category'}.fetch(prop.size, '-')
  make_const(prop, pair_codepoints(data[prop]), category)
end
# parse_scripts prints the per-script tables itself, so it has to run before
# the #endif is written.
scripts = parse_scripts(ARGV[1])
puts "#endif /* USE_UNICODE_PROPERTIES */"

# Table of all code range arrays, in the same order as the ctype indices
# assigned further down.
puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
POSIX_NAMES.each { |posix| puts "  CR_#{posix}," }
puts "#ifdef USE_UNICODE_PROPERTIES"
(gcps + scripts).each { |prop| puts "  CR_#{prop}," }
puts "#endif /* USE_UNICODE_PROPERTIES */"
puts "};"

puts(<<'__HEREDOC')
struct uniname2ctype_struct {
  int name, ctype;
};

static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC

# Keyword section: one "normalized-name, index" line per property. The index
# is a running counter shared by all three groups.
i = -1
print_entry = lambda do |prop|
  puts "%-21s %3d" % ["#{normalize_propname(prop)},", i += 1]
end
POSIX_NAMES.each(&print_entry)
puts "#ifdef USE_UNICODE_PROPERTIES"
gcps.each(&print_entry)
scripts.each(&print_entry)
puts "#endif /* USE_UNICODE_PROPERTIES */\n"

puts(<<'__HEREDOC')
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC