diff options
Diffstat (limited to 'trunk/tool/transcode-tblgen.rb')
-rw-r--r-- | trunk/tool/transcode-tblgen.rb | 637 |
1 files changed, 0 insertions, 637 deletions
diff --git a/trunk/tool/transcode-tblgen.rb b/trunk/tool/transcode-tblgen.rb deleted file mode 100644 index f79fc551ec..0000000000 --- a/trunk/tool/transcode-tblgen.rb +++ /dev/null @@ -1,637 +0,0 @@ -require 'optparse' -require 'erb' -require 'fileutils' - -C_ESC = { - "\\" => "\\\\", - '"' => '\"', - "\n" => '\n', -} - -0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch } -0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch } -C_ESC_PAT = Regexp.union(*C_ESC.keys) - -def c_esc(str) - '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"' -end - -class StrSet - def self.parse(pattern) - if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern - raise ArgumentError, "invalid pattern: #{pattern.inspect}" - end - result = [] - pattern.scan(/\S+/) {|seq| - seq_result = [] - while !seq.empty? - if /\A([0-9a-f][0-9a-f])/i =~ seq - byte = $1.to_i(16) - seq_result << [byte..byte] - seq = $' - elsif /\A\{([^\}]+)\}/ =~ seq - set = $1 - seq = $' - set_result = [] - set.scan(/[^,]+/) {|range| - if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/ =~ range - b = $1.to_i(16) - e = $2.to_i(16) - set_result << (b..e) - elsif /\A([0-9a-f][0-9a-f])\z/ =~ range - byte = $1.to_i(16) - set_result << (byte..byte) - else - raise "invalid range: #{range.inspect}" - end - } - seq_result << set_result - else - raise "invalid sequence: #{seq.inspect}" - end - end - result << seq_result - } - self.new(result) - end - - def initialize(pat) - @pat = pat - end - - def hash - @pat.hash - end - - def eql?(other) - self.class == other.class && - @pat == other.instance_eval { @pat } - end - - alias == eql? - - def to_s - if @pat.empty? - "(empset)" - else - @pat.map {|seq| - if seq.empty? - "(empstr)" - else - seq.map {|byteset| - if byteset.length == 1 && byteset[0].begin == byteset[0].end - "%02x" % byteset[0].begin - else - "{" + - byteset.map {|range| - if range.begin == range.end - "%02x" % range.begin - else - "%02x-%02x" % [range.begin, range.end] - end - }.join(',') + - "}" - end - }.join('') - end - }.join(' ') - end - end - - def inspect - "\#<#{self.class}: #{self.to_s}>" - end - - def min_length - if @pat.empty? - nil - else - @pat.map {|seq| seq.length }.min - end - end - - def max_length - if @pat.empty? - nil - else - @pat.map {|seq| seq.length }.max - end - end - - def emptyable? - @pat.any? {|seq| - seq.empty? - } - end - - def first_bytes - result = {} - @pat.each {|seq| - next if seq.empty? - seq.first.each {|range| - range.each {|byte| - result[byte] = true - } - } - } - result.keys.sort - end - - def each_firstbyte - h = {} - @pat.each {|seq| - next if seq.empty? - seq.first.each {|range| - range.each {|byte| - (h[byte] ||= []) << seq[1..-1] - } - } - } - h.keys.sort.each {|byte| - yield byte, StrSet.new(h[byte]) - } - end -end - -class ActionMap - def self.parse(hash) - h = {} - hash.each {|pat, action| - h[StrSet.parse(pat)] = action - } - self.new(h) - end - - def initialize(h) - @map = h - end - - def hash - hash = 0 - @map.each {|k,v| - hash ^= k.hash ^ v.hash - } - hash - end - - def eql?(other) - self.class == other.class && - @map == other.instance_eval { @map } - end - - alias == eql? - - def inspect - "\#<#{self.class}:" + - @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') + - ">" - end - - def max_input_length - @map.keys.map {|k| k.max_length }.max - end - - def empty_action - @map.each {|ss, action| - return action if ss.emptyable? - } - nil - end - - def each_firstbyte(valid_encoding=nil) - h = {} - @map.each {|ss, action| - if ss.emptyable? - raise "emptyable pattern" - else - ss.each_firstbyte {|byte, rest| - h[byte] ||= {} - if h[byte][rest] - raise "ambiguous" - end - h[byte][rest] = action - } - end - } - if valid_encoding - valid_encoding.each_firstbyte {|byte, rest| - if h[byte] - am = ActionMap.new(h[byte]) - yield byte, am, rest - else - am = ActionMap.new(rest => :undef) - yield byte, am, nil - end - } - else - h.keys.sort.each {|byte| - am = ActionMap.new(h[byte]) - yield byte, am, nil - } - end - end - - OffsetsMemo = {} - InfosMemo = {} - - def format_offsets(min, max, offsets) - offsets = offsets[min..max] - code = "{ %d, %d,\n" % [min, max] - 0.step(offsets.length-1,16) {|i| - code << " " - code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('') - if i+8 < offsets.length - code << " " - code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('') - end - code << "\n" - } - code << '}' - code - end - - def generate_info(info) - case info - when :nomap - "NOMAP" - when :undef - "UNDEF" - when :invalid - "INVALID" - when :func_ii - "FUNii" - when :func_si - "FUNsi" - when :func_io - "FUNio" - when :func_so - "FUNso" - when /\A([0-9a-f][0-9a-f])\z/i - "o1(0x#$1)" - when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i - "o2(0x#$1,0x#$2)" - when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i - "o3(0x#$1,0x#$2,0x#$3)" - when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i - "o4(0x#$1,0x#$2,0x#$3,0x#$4)" - when /\A&/ # pointer to BYTE_LOOKUP structure - info.to_s - else - raise "unexpected action: #{info.inspect}" - end - end - - def format_infos(infos) - infos = infos.map {|info| generate_info(info) } - maxlen = infos.map {|info| info.length }.max - columns = maxlen <= 16 ? 4 : 2 - code = "{\n" - 0.step(infos.length-1, columns) {|i| - code << " " - is = infos[i,columns] - is.each {|info| - code << sprintf(" %#{maxlen}s,", info) - } - code << "\n" - } - code << "}" - code - end - - def generate_lookup_node(name, table) - offsets = [] - infos = [] - infomap = {} - min = max = nil - table.each_with_index {|action, byte| - action ||= :invalid - if action != :invalid - min = byte if !min - max = byte - end - unless o = infomap[action] - infomap[action] = o = infos.length - infos[o] = action - end - offsets[byte] = o - } - if !min - min = max = 0 - end - - offsets_key = [min, max, offsets[min..max]] - if n = OffsetsMemo[offsets_key] - offsets_name = n - offsets_code = '' - else - offsets_name = "#{name}_offsets" - offsets_code = <<"End" -static const unsigned char -#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)}; -End - OffsetsMemo[offsets_key] = offsets_name - end - - if n = InfosMemo[infos] - infos_name = n - infos_code = '' - else - infos_name = "#{name}_infos" - infos_code = <<"End" -static const struct byte_lookup* const -#{infos_name}[#{infos.length}] = #{format_infos(infos)}; -End - InfosMemo[infos] = infos_name - end - - r = offsets_code + infos_code + <<"End" -static const BYTE_LOOKUP -#{name} = { - #{offsets_name}, - #{infos_name} -}; - -End - r - end - - PreMemo = {} - PostMemo = {} - NextName = "a" - - def generate_node(code, name_hint=nil, valid_encoding=nil) - if n = PreMemo[[self,valid_encoding]] - return n - end - - table = Array.new(0x100, :invalid) - each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding| - if a = rest.empty_action - table[byte] = a - else - name_hint2 = nil - name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint - table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding) - end - } - - if n = PostMemo[table] - return n - end - - if !name_hint - name_hint = "fun_" + NextName.dup - NextName.succ! - end - - PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint - - code << generate_lookup_node(name_hint, table) - name_hint - end -end - -def encode_utf8(map) - r = [] - map.each {|k, v| - # integer means UTF-8 encoded sequence. - k = [k].pack("U").unpack("H*")[0].upcase if Integer === k - v = [v].pack("U").unpack("H*")[0].upcase if Integer === v - r << [k,v] - } - r -end - -def transcode_compile_tree(name, from, map) - map = encode_utf8(map) - h = {} - map.each {|k, v| - h[k] = v - } - am = ActionMap.parse(h) - - max_input = am.max_input_length - - if ValidEncoding[from] - valid_encoding = StrSet.parse(ValidEncoding[from]) - else - valid_encoding = nil - end - - code = '' - defined_name = am.generate_node(code, name, valid_encoding) - return defined_name, code, max_input -end - -TRANSCODERS = [] - -def transcode_tblgen(from, to, map) - STDERR.puts "converter from #{from} to #{to}" if VERBOSE_MODE - id_from = from.tr('^0-9A-Za-z', '_') - id_to = to.tr('^0-9A-Za-z', '_') - if from == "UTF-8" - tree_name = "to_#{id_to}" - elsif to == "UTF-8" - tree_name = "from_#{id_from}" - else - tree_name = "from_#{id_from}_to_#{id_to}" - end - map = encode_utf8(map) - real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map) - transcoder_name = "rb_#{tree_name}" - TRANSCODERS << transcoder_name - input_unit_length = UnitLength[from] - max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max - transcoder_code = <<"End" -static const rb_transcoder -#{transcoder_name} = { - #{c_esc from}, #{c_esc to}, &#{real_tree_name}, - #{input_unit_length}, /* input_unit_length */ - #{max_input}, /* max_input */ - #{max_output}, /* max_output */ - stateless_converter, /* stateful_type */ - NULL, NULL, NULL, NULL, - NULL, NULL, NULL -}; -End - tree_code + "\n" + transcoder_code -end - -def transcode_generate_node(am, name_hint=nil) - STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE - code = '' - am.generate_node(code, name_hint) - code -end - -def transcode_register_code - code = '' - TRANSCODERS.each {|transcoder_name| - code << " rb_register_transcoder(&#{transcoder_name});\n" - } - code -end - -UnitLength = { - 'UTF-16BE' => 2, - 'UTF-16LE' => 2, - 'UTF-32BE' => 4, - 'UTF-32LE' => 4, -} -UnitLength.default = 1 - -ValidEncoding = { - '1byte' => '{00-ff}', - '2byte' => '{00-ff}{00-ff}', - '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}', - 'US-ASCII' => '{00-7f}', - 'UTF-8' => '{00-7f} - {c2-df}{80-bf} - e0{a0-bf}{80-bf} - {e1-ec}{80-bf}{80-bf} - ed{80-9f}{80-bf} - {ee-ef}{80-bf}{80-bf} - f0{90-bf}{80-bf}{80-bf} - {f1-f3}{80-bf}{80-bf}{80-bf} - f4{80-8f}{80-bf}{80-bf}', - 'UTF-16BE' => '{00-d7,e0-ff}{00-ff} - {d8-db}{00-ff}{dc-df}{00-ff}', - 'UTF-16LE' => '{00-ff}{00-d7,e0-ff} - {00-ff}{d8-db}{00-ff}{dc-df}', - 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff} - 00{01-10}{00-ff}{00-ff}', - 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000 - {00-ff}{00-ff}{01-10}00', - 'EUC-JP' => '{00-7f} - {a1-fe}{a1-fe} - 8e{a1-fe} - 8f{a1-fe}{a1-fe}', - 'CP51932' => '{00-7f} - {a1-fe}{a1-fe} - 8e{a1-fe}', - 'Shift_JIS' => '{00-7f} - {81-9f,e0-fc}{40-7e,80-fc} - {a1-df}', - 'EUC-KR' => '{00-7f} - {a1-fe}{a1-fe}', - 'CP949' => '{00-7f} - {81-fe}{41-5a,61-7a,81-fe}', - 'Big5' => '{00-7f} - {81-fe}{40-7e,a1-fe}', - 'EUC-TW' => '{00-7f} - {a1-fe}{a1-fe} - 8e{a1-b0}{a1-fe}{a1-fe}', - 'GBK' => '{00-80} - {81-fe}{40-7e,80-fe}', - 'GB18030' => '{00-7f} - {81-fe}{40-7e,80-fe} - {81-fe}{30-39}{81-fe}{30-39}', -} - -{ - 'ASCII-8BIT' => '1byte', - 'ISO-8859-1' => '1byte', - 'ISO-8859-2' => '1byte', - 'ISO-8859-3' => '1byte', - 'ISO-8859-4' => '1byte', - 'ISO-8859-5' => '1byte', - 'ISO-8859-6' => '1byte', - 'ISO-8859-7' => '1byte', - 'ISO-8859-8' => '1byte', - 'ISO-8859-9' => '1byte', - 'ISO-8859-10' => '1byte', - 'ISO-8859-11' => '1byte', - 'ISO-8859-13' => '1byte', - 'ISO-8859-14' => '1byte', - 'ISO-8859-15' => '1byte', - 'Windows-31J' => 'Shift_JIS', -}.each {|k, v| - ValidEncoding[k] = ValidEncoding.fetch(v) -} - -def make_signature(filename, src) - "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}" -end - -output_filename = nil -verbose_mode = false -force_mode = false - -op = OptionParser.new -op.def_option("--help", "show help message") { puts op; exit 0 } -op.def_option("--verbose", "verbose mode") { verbose_mode = true } -op.def_option("--force", "force table generation") { force_mode = true } -op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg } -op.parse! - -VERBOSE_MODE = verbose_mode - -arg = ARGV.shift -dir = File.dirname(arg) -$:.unshift dir unless $:.include? dir -src = File.read(arg) -src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding -this_script = File.read(__FILE__) -this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding - -base_signature = "/* autogenerated. */\n" -base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n" -base_signature << "/* #{make_signature(File.basename(arg), src)} */\n" - -if !force_mode && output_filename && File.readable?(output_filename) - old_signature = File.open(output_filename) {|f| f.gets("").chomp } - chk_signature = base_signature.dup - old_signature.each_line {|line| - if %r{/\* src="([0-9a-z_.-]+)",} =~ line - name = $1 - next if name == File.basename(arg) || name == File.basename(__FILE__) - path = File.join(dir, name) - if File.readable? path - chk_signature << "/* #{make_signature(name, File.read(path))} */\n" - end - end - } - if old_signature == chk_signature - now = Time.now - File.utime(now, now, output_filename) - STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE - exit - end -end - -if VERBOSE_MODE - if output_filename - STDERR.puts "generating #{output_filename} ..." - end -end - -libs1 = $".dup -erb_result = ERB.new(src, nil, '%').result(binding) -libs2 = $".dup - -libs = libs2 - libs1 -lib_sigs = '' -libs.each {|lib| - lib = File.basename(lib) - path = File.join(dir, lib) - if File.readable? path - lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n" - end -} - -result = '' -result << base_signature -result << lib_sigs -result << "\n" -result << erb_result -result << "\n" - -if output_filename - new_filename = output_filename + ".new" - FileUtils.mkdir_p(File.dirname(output_filename)) - File.open(new_filename, "wb") {|f| f << result } - File.rename(new_filename, output_filename) - STDERR.puts "done." if VERBOSE_MODE -else - print result -end |