summaryrefslogtreecommitdiff
path: root/tool/enc-unicode.rb
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2009-08-25 16:15:38 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2009-08-25 16:15:38 +0000
commitf1eff957452f55c7382beee92fd11b979a61e101 (patch)
tree0a253b18d9422dbe2f45a23ac14c92ed1dc1b472 /tool/enc-unicode.rb
parent649f648ec1ebc166dc37aeed7c0be50e8bfd85d2 (diff)
Update Oniguruma's UnicodeData to 5.1.
* tool/enc-unicode.rb: added for generate name2ctype.kwd. contributed by Run Paint Run Run [ruby-core:24775] use like following: ruby19 tool/enc-unicode.rb enc/unicode/UnicodeData.txt \ enc/unicode/Scripts.txt > enc/unicode/name2ctype.kwd * enc/unicode.c (CodeRanges): move definitions to name2ctype.h. * enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: updated to v5.1. * enc/unicode/UnicodeData.txt, enc/unicode/Scripts.txt: added v5.1. * Makefile.in: add rule to generate name2ctype.kwd from UnicodeData.txt and Scripts.txt. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24651 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool/enc-unicode.rb')
-rwxr-xr-xtool/enc-unicode.rb230
1 files changed, 230 insertions, 0 deletions
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
new file mode 100755
index 0000000..b9f19c9
--- /dev/null
+++ b/tool/enc-unicode.rb
@@ -0,0 +1,230 @@
+#!/usr/bin/env ruby
+
+# Creates the data structures needed by Onigurma to map Unicode codepoints to
+# property names and POSIX character classes
+
+unless ARGV.size == 2
+ $stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
+ exit(1)
+end
+
+POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII]
+
+def pair_codepoints(codepoints)
+
+ # We have a sorted Array of codepoints that we wish to partition into
+ # ranges such that the start- and endpoints form an inclusive set of
+ # codepoints with property _property_. Note: It is intended that some ranges
+ # will begin with the value with which they end, e.g. 0x0020 -> 0x0020
+
+ codepoints = codepoints.uniq.sort
+ last_cp = codepoints.first
+ pairs = [[last_cp, nil]]
+ codepoints[1..-1].each do |codepoint|
+
+ # If the current codepoint does not follow directly on from the last
+ # codepoint, the last codepoint represents the end of the current range,
+ # and the current codepoint represents the start of the next range.
+ if last_cp.next != codepoint
+ pairs[-1][-1] = last_cp
+ pairs << [codepoint, nil]
+ end
+ last_cp = codepoint
+ end
+
+ # The final pair has as its endpoint the last codepoint for this property
+ pairs[-1][-1] = codepoints.last
+ pairs
+end
+
+def parse_unicode_data(file)
+ last_cp = 0
+ data = {'Cn' => []}
+ IO.foreach(file) do |line|
+ fields = line.split(';')
+ cp = fields[0].to_i(16)
+
+ # The Cn category represents unassigned characters. These are not listed in
+ # UnicodeData.txt so we must derive them by looking for 'holes' in the range
+ # of listed codepoints. We increment the last codepoint seen and compare it
+ # with the current codepoint. If the current codepoint is less than
+ # last_cp.next we have found a hole, so we add the missing codepoint to the
+ # Cn category.
+ while ((last_cp = last_cp.next) < cp)
+ data['Cn'] << last_cp
+ end
+
+ # The third field denotes the 'General' category, e.g. Lu
+ (data[fields[2]] ||= []) << cp
+
+ # The 'Major' category is the first letter of the 'General' category, e.g.
+ # 'Lu' -> 'L'
+ (data[fields[2][0,1]] ||= []) << cp
+ last_cp = cp
+ end
+
+ # General Category property
+ gcps = %w[Any Assigned]
+ gcps.concat data.keys.sort
+
+ # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
+ # codepoints to Cn and C
+ cn_remainder = (data['Cn'].last.next..0x10ffff).to_a
+ data['Cn'] += cn_remainder
+ data['C'] += cn_remainder
+
+ # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
+ #
+
+ # alnum Letter | Mark | Decimal_Number
+ data['Alnum'] = data['L'] + data['M'] + data['Nd']
+
+ # alpha Letter | Mark
+ data['Alpha'] = data['L'] + data['M']
+
+ # ascii 0000 - 007F
+ data['ASCII'] = (0..0x007F).to_a
+
+ # blank Space_Separator | 0009
+ data['Blank'] = data['Zs'] + [0x0009]
+
+ # cntrl Control
+ data['Cntrl'] = data['Cc']
+
+ # digit Decimal_Number
+ data['Digit'] = data['Nd']
+
+ # lower Lowercase_Letter
+ data['Lower'] = data['Ll']
+
+ # punct Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
+ # Final_Punctuation | Initial_Punctuation | Other_Punctuation |
+ # Open_Punctuation
+ # NOTE: This definition encompasses the entire P category, and the current
+ # mappings agree, but we explcitly declare this way to marry it with the above
+ # definition.
+ data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
+ data['Pi'] + data['Po'] + data['Ps']
+
+ # space Space_Separator | Line_Separator | Paragraph_Separator |
+ # 0009 | 000A | 000B | 000C | 000D | 0085
+ data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
+ [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]
+
+ # upper Uppercase_Letter
+ data['Upper'] = data['Lu']
+
+ # xdigit 0030 - 0039 | 0041 - 0046 | 0061 - 0066
+ # (0-9, a-f, A-F)
+ data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
+ (0x0061..0x0066).to_a
+
+ # word Letter | Mark | Decimal_Number | Connector_Punctuation
+ data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']
+
+ # graph [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
+ data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
+ data['Graph'] -= data['Space'] - data['C']
+
+ # print [[:graph:]] | [[:space:]]
+ data['Print'] = data['Graph'] + data['Space']
+
+ # NEWLINE - This was defined in unicode.c
+ data['NEWLINE'] = [0x000a]
+
+ # Any - Defined in unicode.c
+ data['Any'] = (0x0000..0x10ffff).to_a
+
+ # Assigned - Defined in unicode.c; interpreted as every character in the
+ # Unicode range minus the unassigned characters
+ data['Assigned'] = data['Any'] - data['Cn']
+
+ # Returns General Category Property names and the data
+ [gcps, data]
+end
+
+
+def parse_scripts(file)
+ script = nil
+ data = []
+ names = []
+ IO.foreach(file) do |line|
+ if /^# Total code points: / =~ line
+ make_const(script, pair_codepoints(data), 'Script')
+ names << script
+ data = []
+ elsif /^([[:xdigit:]]+)(?:..([[:xdigit:]]+))?\s*;\s*(\w+)/ =~ line
+ script = $3
+ $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
+ end
+ end
+ names
+end
+
+# make_const(property, pairs, name): Prints a 'static const' structure for a
+# given property, group of paired codepoints, and a human-friendly name for
+# the group
+def make_const(prop, pairs, name)
+ puts "\n/* '#{prop}': #{name} */"
+ puts "static const OnigCodePoint CR_#{prop}[] = {"
+ # The first element of the constant is the number of pairs of codepoints
+ puts "\t#{pairs.size},"
+ pairs.each do |pair|
+ pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
+ puts "\t#{pair.first}, #{pair.last},"
+ end
+ puts "}; /* CR_#{prop} */"
+end
+
+puts '%{'
+gcps, data = parse_unicode_data(ARGV[0])
+POSIX_NAMES.each do |name|
+ make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
+end
+print "\n#ifdef USE_UNICODE_PROPERTIES"
+gcps.each do |name|
+ category =
+ case name.size
+ when 1 then 'Major Category'
+ when 2 then 'General Category'
+ else '-'
+ end
+ make_const(name, pair_codepoints(data[name]), category)
+end
+scripts = parse_scripts(ARGV[1])
+puts "#endif /* USE_UNICODE_PROPERTIES */"
+
+puts "\n\nstatic const OnigCodePoint* const CodeRanges[] = {"
+POSIX_NAMES.each{|name|puts" CR_#{name},"}
+puts "#ifdef USE_UNICODE_PROPERTIES"
+gcps.each{|name|puts" CR_#{name},"}
+scripts.each{|name|puts" CR_#{name},"}
+puts "#endif /* USE_UNICODE_PROPERTIES */"
+puts "};"
+
+puts(<<'__HEREDOC')
+struct uniname2ctype_struct {
+ int name, ctype;
+};
+
+static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
+%}
+struct uniname2ctype_struct;
+%%
+__HEREDOC
+i = -1
+POSIX_NAMES.each {|name|puts"%-21s %3d"%[name+',', i+=1]}
+puts "#ifdef USE_UNICODE_PROPERTIES"
+gcps.each{|name|puts"%-21s %3d"%[name+',', i+=1]}
+scripts.each{|name|puts"%-21s %3d"%[name+',', i+=1]}
+puts "#endif /* USE_UNICODE_PROPERTIES */\n"
+puts(<<'__HEREDOC')
+%%
+static int
+uniname2ctype(const UChar *name, unsigned int len)
+{
+ const struct uniname2ctype_struct *p = uniname2ctype_p((const char *)name, len);
+ if (p) return p->ctype;
+ return -1;
+}
+__HEREDOC