#!/usr/bin/env ruby
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes
#
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
# DerivedCoreProperties.txt, PropertyAliases.txt and PropertyValueAliases.txt
# from unicode.org (http://unicode.org/Public/UNIDATA/), then run the
# following command:
#   ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
# This gives you the source file for gperf. After this, simply make ruby.

unless ARGV.size == 1
  $stderr.puts "Usage: #{$0} data_directory"
  exit(1)
end

POSIX_NAMES = %w[NEWLINE Alpha Blank Cntrl Digit Graph Lower Print Punct Space
                 Upper XDigit Word Alnum ASCII]

def pair_codepoints(codepoints)

  # We have a sorted Array of codepoints that we wish to partition into
  # ranges such that the start- and endpoints form an inclusive set of
  # codepoints with property _property_. Note: It is intended that some ranges
  # will begin with the value with which they end, e.g. 0x0020 -> 0x0020

  codepoints.sort!
  last_cp = codepoints.first
  pairs = [[last_cp, nil]]
  codepoints[1..-1].each do |codepoint|
    next if last_cp == codepoint

    # If the current codepoint does not follow directly on from the last
    # codepoint, the last codepoint represents the end of the current range,
    # and the current codepoint represents the start of the next range.
    if last_cp.next != codepoint
      pairs[-1][-1] = last_cp
      pairs << [codepoint, nil]
    end
    last_cp = codepoint
  end

  # The final pair has as its endpoint the last codepoint for this property
  pairs[-1][-1] = codepoints.last
  pairs
end
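
# As a rough illustration (hypothetical input), adjacent codepoints collapse
# into a single inclusive range, while an isolated codepoint becomes a range
# that starts and ends on the same value:
#
#   pair_codepoints([0x41, 0x42, 0x43, 0x61])
#   # => [[0x41, 0x43], [0x61, 0x61]]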

def parse_unicode_data(file)
  last_cp = 0
  data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
  beg_cp = nil
  IO.foreach(file) do |line|
    fields = line.split(';')
    cp = fields[0].to_i(16)

    case fields[1]
    when /\A<(.*),\s*First>\z/
      beg_cp = cp
      next
    when /\A<(.*),\s*Last>\z/
      cps = (beg_cp..cp).to_a
    else
      beg_cp = cp
      cps = [cp]
    end

    # The Cn category represents unassigned characters. These are not listed
    # in UnicodeData.txt so we must derive them by looking for 'holes' in the
    # range of listed codepoints. We increment the last codepoint seen and
    # compare it with the current codepoint. If the current codepoint is less
    # than last_cp.next we have found a hole, so we add the missing codepoints
    # to the Cn category.
    data['Cn'].concat((last_cp.next...beg_cp).to_a)

    # Assigned - Defined in unicode.c; interpreted as every character in the
    # Unicode range minus the unassigned characters
    data['Assigned'].concat(cps)

    # The third field denotes the 'General' category, e.g. Lu
    (data[fields[2]] ||= []).concat(cps)

    # The 'Major' category is the first letter of the 'General' category, e.g.
    # 'Lu' -> 'L'
    (data[fields[2][0,1]] ||= []).concat(cps)
    last_cp = cp
  end

  # The last Cn codepoint should be 0x10ffff. If it's not, append the missing
  # codepoints to Cn and C
  cn_remainder = (last_cp.next..0x10ffff).to_a
  data['Cn'] += cn_remainder
  data['C'] += cn_remainder

  # Define General Category properties
  gcps = data.keys.sort

  # We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
  #
  # alnum    Letter | Mark | Decimal_Number
  data['Alnum'] = data['L'] + data['M'] + data['Nd']

  # alpha    Letter | Mark
  data['Alpha'] = data['L'] + data['M']

  # ascii    0000 - 007F
  data['ASCII'] = (0..0x007F).to_a

  # blank    Space_Separator | 0009
  data['Blank'] = data['Zs'] + [0x0009]

  # cntrl    Control
  data['Cntrl'] = data['Cc']

  # digit    Decimal_Number
  data['Digit'] = data['Nd']

  # lower    Lowercase_Letter
  data['Lower'] = data['Ll']

  # punct    Connector_Punctuation | Dash_Punctuation | Close_Punctuation |
  #          Final_Punctuation | Initial_Punctuation | Other_Punctuation |
  #          Open_Punctuation
  # NOTE: This definition encompasses the entire P category, and the current
  # mappings agree, but we explicitly declare it this way to marry it with the
  # above definition.
  data['Punct'] = data['Pc'] + data['Pd'] + data['Pe'] + data['Pf'] +
                  data['Pi'] + data['Po'] + data['Ps']

  # space    Space_Separator | Line_Separator | Paragraph_Separator |
  #          0009 | 000A | 000B | 000C | 000D | 0085
  data['Space'] = data['Zs'] + data['Zl'] + data['Zp'] +
                  [0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0085]

  # upper    Uppercase_Letter
  data['Upper'] = data['Lu']

  # xdigit   0030 - 0039 | 0041 - 0046 | 0061 - 0066
  #          (0-9, A-F, a-f)
  data['XDigit'] = (0x0030..0x0039).to_a + (0x0041..0x0046).to_a +
                   (0x0061..0x0066).to_a

  # word     Letter | Mark | Decimal_Number | Connector_Punctuation
  data['Word'] = data['L'] + data['M'] + data['Nd'] + data['Pc']

  # graph    [[:^space:]] && ^Control && ^Unassigned && ^Surrogate
  data['Graph'] = data['L'] + data['M'] + data['N'] + data['P'] + data['S']
  data['Graph'] -= data['Space'] - data['C']

  # print    [[:graph:]] | [[:space:]]
  data['Print'] = data['Graph'] + data['Space']

  # NEWLINE - This was defined in unicode.c
  data['NEWLINE'] = [0x000a]

  # Any - Defined in unicode.c
  data['Any'] = (0x0000..0x10ffff).to_a

  # Returns General Category Property names and the data
  [gcps, data]
end

def parse_scripts
  files = [
    {fn: 'DerivedCoreProperties.txt', title: 'Derived Property'},
    {fn: 'Scripts.txt', title: 'Script'},
    {fn: 'PropList.txt', title: 'Binary Property'}
  ]
  current = nil
  data = []
  names = []
  files.each do |file|
    IO.foreach(get_file(file[:fn])) do |line|
      if /^# Total code points: / =~ line
        make_const(current, pair_codepoints(data), file[:title])
        names << current
        data = []
      elsif /^(\h+)(?:..(\h+))?\s*;\s*(\w+)/ =~ line
        current = $3
        $2 ? data.concat(($1.to_i(16)..$2.to_i(16)).to_a) : data.push($1.to_i(16))
      end
    end
  end
  names
end

def parse_aliases
  kv = {}
  IO.foreach(get_file('PropertyAliases.txt')) do |line|
    next unless /^(\w+)\s*; (\w+)/ =~ line
    kv[normalize_propname($1)] = normalize_propname($2)
  end
  IO.foreach(get_file('PropertyValueAliases.txt')) do |line|
    next unless /^(sc|gc)\s*; (\w+)\s*; (\w+)(?:\s*; (\w+))?/ =~ line
    if $1 == 'gc'
      kv[normalize_propname($3)] = normalize_propname($2)
      kv[normalize_propname($4)] = normalize_propname($2) if $4
    else
      kv[normalize_propname($2)] = normalize_propname($3)
      kv[normalize_propname($4)] = normalize_propname($3) if $4
    end
  end
  kv
end

# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
def make_const(prop, pairs, name)
  puts "\n/* '#{prop}': #{name} */"
  puts "static const OnigCodePoint CR_#{prop}[] = {"
  # The first element of the constant is the number of pairs of codepoints
  puts "\t#{pairs.size},"
  pairs.each do |pair|
    pair.map! { |c| c == 0 ? '0x0000' : sprintf("%0#6x", c) }
    puts "\t#{pair.first}, #{pair.last},"
  end
  puts "}; /* CR_#{prop} */"
end
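
# As a rough illustration, a hypothetical property 'Foo' covering
# U+0041..U+005A would be printed by make_const roughly as:
#
#   /* 'Foo': Example */
#   static const OnigCodePoint CR_Foo[] = {
#       1,
#       0x0041, 0x005a,
#   }; /* CR_Foo */
#
# The first number is the count of ranges; each subsequent pair is an
# inclusive start/end codepoint.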

def normalize_propname(name)
  name = name.downcase
  name.delete!('- _')
  name
end

def get_file(name)
  File.join(ARGV[0], name)
end

# Write Data
puts '%{'
props, data = parse_unicode_data(get_file('UnicodeData.txt'))
POSIX_NAMES.each do |name|
  make_const(name, pair_codepoints(data[name]), "[[:#{name}:]]")
end
print "\n#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  category = case name.size
             when 1 then 'Major Category'
             when 2 then 'General Category'
             else        '-'
             end
  make_const(name, pair_codepoints(data[name]), category)
end
props.concat parse_scripts
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
static const OnigCodePoint* const CodeRanges[] = {
__HEREDOC
POSIX_NAMES.each {|name| puts " CR_#{name},"}
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each {|name| puts " CR_#{name},"}
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
};
struct uniname2ctype_struct {
  int name, ctype;
};
static const struct uniname2ctype_struct *uniname2ctype_p(const char *, unsigned int);
%}
struct uniname2ctype_struct;
%%
__HEREDOC
i = -1
name_to_index = {}
POSIX_NAMES.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
puts "#ifdef USE_UNICODE_PROPERTIES"
props.each do |name|
  i += 1
  name = normalize_propname(name)
  name_to_index[name] = i
  puts "%-40s %3d" % [name + ',', i]
end
parse_aliases.each_pair do |k, v|
  next if name_to_index[k]
  next unless v = name_to_index[v]
  puts "%-40s %3d" % [k + ',', v]
end
puts(<<'__HEREDOC')
#endif /* USE_UNICODE_PROPERTIES */
%%
static int
uniname2ctype(const UChar *name, unsigned int len)
{
  const struct uniname2ctype_struct *p =
    uniname2ctype_p((const char *)name, len);
  if (p) return p->ctype;
  return -1;
}
__HEREDOC
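
# As a rough sketch, the keyword section emitted above begins like this (the
# exact indices follow the declaration order of POSIX_NAMES and props, and the
# full list is much longer):
#
#   newline,                                   0
#   alpha,                                     1
#   blank,                                     2
#
# gperf compiles these name/index pairs into uniname2ctype_p(), which the
# generated uniname2ctype() wraps so that a normalized property name such as
# "alpha" resolves to the index of its CR_ table in CodeRanges[].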