#!/usr/bin/ruby # Usage: # $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt # $ ruby CaseFolding.rb CaseFolding.txt > ../enc/unicode/casefold.h class CaseFolding module Util module_function def hex_seq(v) v.map {|i| "0x%04x" % i}.join(", ") end def print_table_1(dest, data) for k, v in data = data.sort sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k) dest.print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n") end data end def print_table(dest, type, data) dest.print("static const #{type}_Type #{type}_Table[] = {\n") i = 0 ret = data.inject([]) do |a, (n, d)| dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n") i += d.size a.concat(print_table_1(dest, d)) end dest.print("};\n\n") ret end end include Util attr_reader :fold, :fold_locale, :unfold, :unfold_locale def load(filename) pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/ @fold = fold = {} @unfold = unfold = [{}, {}, {}] turkic = [] IO.foreach(filename) do |line| next unless res = pattern.match(line) ch_from = res[1].to_i(16) if res[2] == 'T' # Turkic case folding turkic << ch_from next end # store folding data ch_to = res[3..6].inject([]) do |a, i| break a unless i a << i.to_i(16) end fold[ch_from] = ch_to # store unfolding data i = ch_to.length - 1 (unfold[i][ch_to] ||= []) << ch_from end # move locale dependent data to (un)fold_locale @fold_locale = fold_locale = {} @unfold_locale = unfold_locale = [{}, {}] for ch_from in turkic key = fold[ch_from] i = key.length - 1 unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key) fold_locale[ch_from] = fold.delete(ch_from) end self end def lookup_hash(key, type, data) hash = "onigenc_unicode_#{key}_hash" lookup = "onigenc_unicode_#{key}_lookup" gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}" argname = "code" argdecl = "const OnigCodePoint #{argname}" n = 7 m = (1 << n) - 1 min, max = data.map {|c, *|c}.minmax src = IO.popen(gperf, "r+") {|f| f << "short\n%%\n" data.each_with_index {|(k, _), i| ks = [(k >> n*2) & m, (k >> n) & m, (k) & m].map {|c| "\\x%.2x" % c}.join("") f.printf "\"%s\", ::::/*0x%.4x*/ %d\n", ks, k, i } f << "%%\n" f.close_write f.read } src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { name = $1 body = $2 body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)") "#{name}(#{argdecl})\n{\n#{body}}" } src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) { name = $1 body = $2 body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1") body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)') body.gsub!(/\{"",-1}/, "-1") body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1') body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"} v = nil body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) { pre = $1 indent = $2 s = $3 s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]') v = $1 s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))") "#{pre}{#{s}\n#{indent}}" } body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;") "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}" } src end def display(dest) # print the header dest.print("/* DO NOT EDIT THIS FILE. */\n") dest.print("/* Generated by enc/unicode/case-folding.rb */\n\n") # print folding data # CaseFold + CaseFold_Locale name = "CaseFold_11" data = print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale) dest.print lookup_hash(name, "CodePointList3", data) # print unfolding data # CaseUnfold_11 + CaseUnfold_11_Locale name = "CaseUnfold_11" print_table(dest, name, name=>unfold[0], "#{name}_Locale"=>unfold_locale[0]) # CaseUnfold_12 + CaseUnfold_12_Locale name = "CaseUnfold_12" print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1]) # CaseUnfold_13 name = "CaseUnfold_13" print_table(dest, name, name=>unfold[2]) # table sizes unfold1_table_size = unfold[0].size + unfold_locale[0].size dest.printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2)) unfold2_table_size = unfold[1].size + unfold_locale[1].size dest.printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5)) unfold3_table_size = unfold[2].size dest.printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7)) end def self.load(*args) new.load(*args) end end if $0 == __FILE__ require 'optparse' dest = nil fold_1 = false ARGV.options do |opt| opt.banner << " [INPUT]" opt.on("--output-file=FILE", "-o", "output to the FILE instead of STDOUT") {|output| dest = (output unless output == '-') } opt.parse! abort(opt.to_s) if ARGV.size > 1 end filename = ARGV[0] || 'CaseFolding.txt' data = CaseFolding.load(filename) if dest open(dest, "wb") do |f| data.display(f) end else data.display(STDOUT) end end