require 'optparse'
require 'erb'

# Table of byte => C string-literal escape sequence.
# Backslash, double quote and newline get their symbolic escapes; the loops
# below add three-digit octal escapes for the remaining bytes in 0x00-0x1f
# (without overriding an entry that already exists) and for 0x7f-0xff.
C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
C_ESC_PAT = Regexp.union(*C_ESC.keys)

# Returns +str+ as a double-quoted C string literal, escaping every byte
# that has an entry in C_ESC.
def c_esc(str)
  '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
end

# A set of byte strings described by a pattern.
#
# Internally @pat is an array of sequences; each sequence is an array of
# byte sets; each byte set is an array of Integer ranges.
class StrSet
  # Parses a whitespace-separated list of sequences into a StrSet.
  #
  # Each sequence is a run of elements, where an element is either two hex
  # digits (one byte) or a brace set such as "{a1-fe}" or "{00-7f,a1,c0-df}"
  # listing single bytes and inclusive byte ranges.  Hex digits are accepted
  # in either case everywhere.
  #
  # Raises ArgumentError when +pattern+ does not match that grammar.
  def self.parse(pattern)
    if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
      raise ArgumentError, "invalid pattern: #{pattern.inspect}"
    end
    result = []
    pattern.scan(/\S+/) {|seq|
      seq_result = []
      while !seq.empty?
        if /\A([0-9a-f][0-9a-f])/i =~ seq
          byte = $1.to_i(16)
          seq_result << [byte..byte]
          seq = $'
        elsif /\A\{([^\}]+)\}/ =~ seq
          set = $1
          seq = $'
          set_result = []
          set.scan(/[^,]+/) {|range|
            # These two patterns must be case-insensitive like the
            # validation regex above; otherwise upper-case hex inside braces
            # passes validation and then fails here.
            if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/i =~ range
              b = $1.to_i(16)
              e = $2.to_i(16)
              set_result << (b..e)
            elsif /\A([0-9a-f][0-9a-f])\z/i =~ range
              byte = $1.to_i(16)
              set_result << (byte..byte)
            else
              raise "invalid range: #{range.inspect}"
            end
          }
          seq_result << set_result
        else
          raise "invalid sequence: #{seq.inspect}"
        end
      end
      result << seq_result
    }
    self.new(result)
  end

  # +pat+ is the nested array structure described in the class comment.
  def initialize(pat)
    @pat = pat
  end

  # hash/eql?/== are defined together so a StrSet can be used as a Hash key.
  def hash
    @pat.hash
  end

  def eql?(other)
    self.class == other.class &&
    @pat == other.instance_eval { @pat }
  end

  alias == eql?

  # Renders the set back into pattern syntax.  An empty set is shown as
  # "(empset)" and an empty sequence as "(empstr)".
  def to_s
    if @pat.empty?
      "(empset)"
    else
      @pat.map {|seq|
        if seq.empty?
          "(empstr)"
        else
          seq.map {|byteset|
            if byteset.length == 1 && byteset[0].begin == byteset[0].end
              "%02x" % byteset[0].begin
            else
              "{" +
              byteset.map {|range|
                if range.begin == range.end
                  "%02x" % range.begin
                else
                  "%02x-%02x" % [range.begin, range.end]
                end
              }.join(',') +
              "}"
            end
          }.join('')
        end
      }.join(' ')
    end
  end

  def inspect
    "\#<#{self.class}: #{self.to_s}>"
  end

  # True when the set contains the empty sequence.
  def emptyable?
    @pat.any? {|seq| seq.empty? }
  end

  # Returns the sorted list of bytes that can start a non-empty sequence.
  def first_bytes
    result = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte|
          result[byte] = true
        }
      }
    }
    result.keys.sort
  end

  # Yields, in ascending byte order, each possible first byte together with
  # a StrSet of the remainders of the sequences that start with it.
  def each_firstbyte
    h = {}
    @pat.each {|seq|
      next if seq.empty?
      seq.first.each {|range|
        range.each {|byte|
          (h[byte] ||= []) << seq[1..-1]
        }
      }
    }
    h.keys.sort.each {|byte|
      yield byte, StrSet.new(h[byte])
    }
  end
end

# A mapping from StrSet to action, plus the code generator that turns such
# a mapping into C lookup tables.
class ActionMap
  # Builds an ActionMap from a hash of pattern string => action, parsing
  # each key with StrSet.parse.
  def self.parse(hash)
    h = {}
    hash.each {|pat, action|
      h[StrSet.parse(pat)] = action
    }
    self.new(h)
  end

  # +h+ maps StrSet => action.
  def initialize(h)
    @map = h
  end

  # XOR of all key and value hashes, so the result does not depend on
  # insertion order.
  def hash
    hash = 0
    @map.each {|k,v|
      hash ^= k.hash ^ v.hash
    }
    hash
  end

  def eql?(other)
    self.class == other.class &&
    @map == other.instance_eval { @map }
  end

  alias == eql?

  def inspect
    "\#<#{self.class}:" +
    @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
    ">"
  end

  # Returns the action of the first entry whose StrSet contains the empty
  # sequence, or nil when there is none.
  def empty_action
    @map.each {|ss, action|
      return action if ss.emptyable?
    }
    nil
  end

  # Groups the map by first byte and yields (byte, ActionMap, rest).
  #
  # Without +valid_encoding+, every first byte present in the map is yielded
  # in ascending order with the ActionMap of the remainders and nil.
  #
  # With +valid_encoding+ (a StrSet), iteration follows the first bytes of
  # that set instead: bytes present in the map are yielded with their
  # ActionMap and the remaining valid-encoding StrSet; bytes absent from the
  # map are yielded with an ActionMap that maps the remainder to :undef.
  #
  # Raises RuntimeError for an emptyable pattern, or when two entries lead
  # to the same remainder after the same first byte ("ambiguous").
  def each_firstbyte(valid_encoding=nil)
    h = {}
    @map.each {|ss, action|
      if ss.emptyable?
        raise "emptyable pattern"
      else
        ss.each_firstbyte {|byte, rest|
          h[byte] ||= {}
          if h[byte][rest]
            raise "ambiguous"
          end
          h[byte][rest] = action
        }
      end
    }
    if valid_encoding
      valid_encoding.each_firstbyte {|byte, rest|
        if h[byte]
          am = ActionMap.new(h[byte])
          yield byte, am, rest
        else
          am = ActionMap.new(rest => :undef)
          yield byte, am, nil
        end
      }
    else
      h.keys.sort.each {|byte|
        am = ActionMap.new(h[byte])
        yield byte, am, nil
      }
    end
  end

  # Memo tables so identical offset/info arrays are emitted only once and
  # later nodes refer to the first emitted name.
  OffsetsMemo = {}
  InfosMemo = {}

  # Formats the offsets array as a C initializer, 16 values per line in two
  # groups of eight.
  def format_offsets(offsets)
    code = "{\n"
    0.step(offsets.length-1,16) {|i|
      code << " "
      code << offsets[i,8].map {|off| "%3d," % off }.join('')
      code << " "
      code << offsets[i+8,8].map {|off| "%3d," % off }.join('')
      code << "\n"
    }
    code << '}'
    code
  end

  # Converts one action into the C expression stored in an infos table.
  # Symbols :nomap, :undef, :invalid and :func_so map to fixed names; a
  # string of 1-4 hex byte pairs maps to o1(..)..o4(..); anything else is
  # emitted via to_s.
  def generate_info(info)
    case info
    when :nomap
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_so
      "FUNso"
    when /\A([0-9a-f][0-9a-f])\z/i
      "o1(0x#$1)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o2(0x#$1,0x#$2)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o3(0x#$1,0x#$2,0x#$3)"
    when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    else
      info.to_s
    end
  end

  # Formats the infos array as a C initializer, right-aligning entries to
  # the widest one; four per line when they are at most 16 characters wide,
  # otherwise two per line.
  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "{\n"
    0.step(infos.length-1, columns) {|i|
      code << " "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code << "}"
    code
  end

  # Returns the C source for one lookup node called +name+.
  #
  # +table+ holds one action per byte (nil entries become :invalid).  The
  # node is split into an offsets array (byte -> index) and an infos array
  # (index -> action); each is emitted only if an identical array has not
  # been emitted before.
  def generate_lookup_node(name, table)
    offsets = []
    infos = []
    infomap = {}
    table.each_with_index {|action, byte|
      action ||= :invalid
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }

    if n = OffsetsMemo[offsets]
      offsets_name = n
      offsets_code = ''
    else
      offsets_name = "#{name}_offsets"
      offsets_code = <<"End"
static const unsigned char
#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)};
End
      OffsetsMemo[offsets] = offsets_name
    end

    if n = InfosMemo[infos]
      infos_name = n
      infos_code = ''
    else
      infos_name = "#{name}_infos"
      infos_code = <<"End"
static const struct byte_lookup* const
#{infos_name}[#{infos.length}] = #{format_infos(infos)};
End
      InfosMemo[infos] = infos_name
    end

    r = offsets_code + infos_code + <<"End"
static const BYTE_LOOKUP
#{name} = {
    #{offsets_name},
    #{infos_name}
};

End
    r
  end

  # PreMemo is keyed by [ActionMap, valid_encoding], PostMemo by the built
  # table; both map to the name of an already generated node.  NextName is
  # the suffix for the next automatically named node and is advanced in
  # place with succ!.
  PreMemo = {}
  PostMemo = {}
  NextName = "a"

  # Recursively generates lookup nodes for this map, appending their C
  # source to +code+, and returns the name of the node for this level.
  #
  # +name_hint+ is the node name to use; when nil an automatic "fun_<x>"
  # name is chosen.  +ranges+ lists the byte range covered by the table at
  # each depth (0x00..0xff when exhausted).  +valid_encoding+ is passed
  # through to each_firstbyte.
  #
  # Raises RuntimeError when a first byte falls outside the current range.
  def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil)
    ranges = [0x00..0xff] if ranges.empty?
    range = ranges.first
    if n = PreMemo[[self,valid_encoding]]
      return n
    end

    table = Array.new(range.end - range.begin + 1)
    each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
      unless range === byte
        raise "byte not in range"
      end
      if a = rest.empty_action
        # The sequence ends at this byte: store the action itself.
        table[byte-range.begin] = a
      else
        # More bytes follow: store a pointer to the child node.
        name_hint2 = nil
        name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
        table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding)
      end
    }

    if n = PostMemo[table]
      return n
    end

    if !name_hint
      name_hint = "fun_" + NextName.dup
      NextName.succ!
    end

    PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint

    code << generate_lookup_node(name_hint, table)
    name_hint
  end
end

# Returns +map+ as an array of [key, value] pairs in which every Integer key
# or value has been replaced by the upper-case hex string of its UTF-8
# encoding.  Non-Integer entries are passed through unchanged.
def encode_utf8(map)
  r = []
  map.each {|k, v|
    # integer means UTF-8 encoded sequence.
    k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
    v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
    r << [k,v]
  }
  r
end

# Compiles +map+ into lookup tables for source encoding +from+.
# Returns [name of the root node, generated C source].
def transcode_compile_tree(name, from, map)
  map = encode_utf8(map)
  h = {}
  map.each {|k, v|
    h[k] = v
  }
  am = ActionMap.parse(h)

  if ValidEncoding[from]
    valid_encoding = StrSet.parse(ValidEncoding[from])
  else
    valid_encoding = nil
  end

  # For UTF-8 input, tables after the first level cover only 0x80..0xbf.
  ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
  code = ''
  defined_name = am.generate_node(code, name, ranges, valid_encoding)
  return defined_name, code
end

# Names of all transcoders generated so far, in generation order.
TRANSCODERS = []

# Generates the C source (lookup tables plus an rb_transcoder definition)
# for converting +from+ to +to+ according to +map+, records the transcoder
# name in TRANSCODERS, and returns the source.
def transcode_tblgen(from, to, map)
  STDERR.puts "converter for #{from} to #{to}" if VERBOSE_MODE
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  if from == "UTF-8"
    tree_name = "to_#{id_to}"
  elsif to == "UTF-8"
    tree_name = "from_#{id_from}"
  else
    tree_name = "from_#{id_from}_to_#{id_to}"
  end
  map = encode_utf8(map)
  real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  from_utf8 = from == 'UTF-8' ? 1 : 0
  # String values are hex, two digits per output byte; other actions count
  # as one.
  max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
    #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{max_output}, #{from_utf8},
    NULL, NULL,
};
End
  tree_code + "\n" + transcoder_code
end

# Generates lookup nodes for an already built ActionMap +am+, appending the
# source to +code+; returns the root node name.
def transcode_generate_node(am, code, name_hint=nil, ranges=[])
  STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
  am.generate_node(code, name_hint, ranges)
end

# Returns C statements registering every transcoder in TRANSCODERS.
def transcode_register_code
  code = ''
  TRANSCODERS.each {|transcoder_name|
    code << " rb_register_transcoder(&#{transcoder_name});\n"
  }
  code
end

# StrSet patterns describing the valid byte sequences of each encoding.
ValidEncoding = {
  '1byte'       => '{00-ff}',
  '2byte'       => '{00-ff}{00-ff}',
  '4byte'       => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII'    => '{00-7f}',
  'UTF-8'       => '{00-7f} {c2-df}{80-bf} ' \
                   'e0{a0-bf}{80-bf} {e1-ec}{80-bf}{80-bf} ' \
                   'ed{80-9f}{80-bf} {ee-ef}{80-bf}{80-bf} ' \
                   'f0{90-bf}{80-bf}{80-bf} {f1-f3}{80-bf}{80-bf}{80-bf} ' \
                   'f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE'    => '{00-d7,e0-ff}{00-ff} {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE'    => '{00-ff}{00-d7,e0-ff} {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE'    => '0000{00-d7,e0-ff}{00-ff} 00{01-10}{00-ff}{00-ff}',
  'UTF-32LE'    => '{00-ff}{00-d7,e0-ff}0000 {00-ff}{00-ff}{01-10}00',
  'EUC-JP'      => '{00-7f} {a1-fe}{a1-fe} 8e{a1-fe} 8f{a1-fe}{a1-fe}',
  'CP51932'     => '{00-7f} {a1-fe}{a1-fe} 8e{a1-fe}',
  'Shift_JIS'   => '{00-7f} {81-9f,e0-fc}{40-7e,80-fc} {a1-df}',
  'EUC-KR'      => '{00-7f} {a1-fe}{a1-fe}',
  'CP949'       => '{00-7f} {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'        => '{00-7f} {81-fe}{40-7e,a1-fe}',
  'EUC-TW'      => '{00-7f} {a1-fe}{a1-fe} 8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'         => '{00-80} {81-fe}{40-7e,80-fe}',
  'GB18030'     => '{00-7f} {81-fe}{40-7e,80-fe} {81-fe}{30-39}{81-fe}{30-39}',
}

# Encodings that share the pattern of another entry.  fetch raises if the
# referenced entry is missing.
{
  'ISO-8859-1'  => '1byte',
  'ISO-8859-2'  => '1byte',
  'ISO-8859-3'  => '1byte',
  'ISO-8859-4'  => '1byte',
  'ISO-8859-5'  => '1byte',
  'ISO-8859-6'  => '1byte',
  'ISO-8859-7'  => '1byte',
  'ISO-8859-8'  => '1byte',
  'ISO-8859-9'  => '1byte',
  'ISO-8859-10' => '1byte',
  'ISO-8859-11' => '1byte',
  'ISO-8859-13' => '1byte',
  'ISO-8859-14' => '1byte',
  'ISO-8859-15' => '1byte',
  'Windows-31J' => 'Shift_JIS',
}.each {|k, v|
  ValidEncoding[k] = ValidEncoding.fetch(v)
}

# Returns the signature text recorded in the output header for one source
# file: its name, length and String#sum checksum.
def make_signature(filename, src)
  "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
end

# Creates the ERB object for the template with '%' line mode enabled.
# Uses the trim_mode keyword when this ERB accepts it, because the
# positional form is deprecated; falls back to the positional form on ERB
# versions without the keyword.
def transcode_new_erb(src)
  init = ERB.instance_method(:initialize)
  takes_keyword = init.respond_to?(:parameters) &&
    init.parameters.any? {|type, name| type == :key && name == :trim_mode }
  if takes_keyword
    ERB.new(src, :trim_mode => '%')
  else
    ERB.new(src, nil, '%')
  end
end

# ---- command line driver ----

output_filename = nil
verbose_mode = false
force_mode = false

op = OptionParser.new
op.def_option("--help", "show help message") { puts op; exit 0 }
op.def_option("--verbose", "verbose mode") { verbose_mode = true }
op.def_option("--force", "force table generation") { force_mode = true }
op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
op.parse!

VERBOSE_MODE = verbose_mode

arg = ARGV.shift
unless arg
  # Without this check File.dirname(nil) fails with an unhelpful TypeError.
  STDERR.puts "no input file given"
  STDERR.puts op.help
  exit 1
end
dir = File.dirname(arg)
# Let the template require helper files that live next to it.
$:.unshift dir unless $:.include? dir
src = File.read(arg)
src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding

base_signature = "/* autogenerated. */\n"
base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

if !force_mode && output_filename && File.readable?(output_filename)
  # The header is the first paragraph of the existing output.  gets returns
  # nil for an empty file; treat that as "no signature" so the file is
  # regenerated instead of crashing.
  header = File.open(output_filename) {|f| f.gets("") }
  old_signature = header ? header.chomp : ''
  chk_signature = base_signature.dup
  # Recompute the signature of every other source file named in the old
  # header so that a change in any of them is detected.
  old_signature.each_line {|line|
    if %r{/\* src="([0-9a-z_.-]+)",} =~ line
      name = $1
      next if name == File.basename(arg)
      path = File.join(dir, name)
      if File.readable? path
        chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
      end
    end
  }
  # Up to date only if the signatures match and this generator script is
  # older than the output.
  if old_signature == chk_signature && File.mtime(__FILE__) < File.mtime(output_filename)
    now = Time.now
    File.utime(now, now, output_filename)
    STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
    exit
  end
end

if VERBOSE_MODE
  if output_filename
    STDERR.puts "generating #{output_filename} ..."
  end
end

# Snapshot the loaded-feature list around template evaluation to learn
# which files the template required.
libs1 = $".dup
erb_result = transcode_new_erb(src).result(binding)
libs2 = $".dup

libs = libs2 - libs1
lib_sigs = ''
libs.each {|lib|
  lib = File.basename(lib)
  path = File.join(dir, lib)
  if File.readable? path
    lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
  end
}

result = ''
result << base_signature
result << lib_sigs
result << "\n"
result << erb_result
result << "\n"

if output_filename
  # Write to a temporary name and rename, so a failed run does not leave a
  # partially written output file under the final name.
  new_filename = output_filename + ".new"
  File.open(new_filename, "w") {|f| f << result }
  File.rename(new_filename, output_filename)
  STDERR.puts "done." if VERBOSE_MODE
else
  print result
end