summaryrefslogtreecommitdiff
path: root/trunk/tool/transcode-tblgen.rb
diff options
context:
space:
mode:
authoryugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-25 15:02:05 +0000
committeryugui <yugui@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-25 15:02:05 +0000
commit0dc342de848a642ecce8db697b8fecd83a63e117 (patch)
tree2b7ed4724aff1f86073e4740134bda9c4aac1a39 /trunk/tool/transcode-tblgen.rb
parentef70cf7138ab8034b5b806f466e4b484b24f0f88 (diff)
added tag v1_9_0_4
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/tags/v1_9_0_4@18845 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'trunk/tool/transcode-tblgen.rb')
-rw-r--r--trunk/tool/transcode-tblgen.rb637
1 files changed, 637 insertions, 0 deletions
diff --git a/trunk/tool/transcode-tblgen.rb b/trunk/tool/transcode-tblgen.rb
new file mode 100644
index 0000000000..f79fc551ec
--- /dev/null
+++ b/trunk/tool/transcode-tblgen.rb
@@ -0,0 +1,637 @@
+require 'optparse'
+require 'erb'
+require 'fileutils'
+
+C_ESC = {
+ "\\" => "\\\\",
+ '"' => '\"',
+ "\n" => '\n',
+}
+
+0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
+0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
+C_ESC_PAT = Regexp.union(*C_ESC.keys)
+
+def c_esc(str)
+ '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
+end
+
+class StrSet
+ def self.parse(pattern)
+ if /\A\s*(([0-9a-f][0-9a-f]|\{([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f])(,([0-9a-f][0-9a-f]|[0-9a-f][0-9a-f]-[0-9a-f][0-9a-f]))*\})+(\s+|\z))*\z/i !~ pattern
+ raise ArgumentError, "invalid pattern: #{pattern.inspect}"
+ end
+ result = []
+ pattern.scan(/\S+/) {|seq|
+ seq_result = []
+ while !seq.empty?
+ if /\A([0-9a-f][0-9a-f])/i =~ seq
+ byte = $1.to_i(16)
+ seq_result << [byte..byte]
+ seq = $'
+ elsif /\A\{([^\}]+)\}/ =~ seq
+ set = $1
+ seq = $'
+ set_result = []
+ set.scan(/[^,]+/) {|range|
+ if /\A([0-9a-f][0-9a-f])-([0-9a-f][0-9a-f])\z/ =~ range
+ b = $1.to_i(16)
+ e = $2.to_i(16)
+ set_result << (b..e)
+ elsif /\A([0-9a-f][0-9a-f])\z/ =~ range
+ byte = $1.to_i(16)
+ set_result << (byte..byte)
+ else
+ raise "invalid range: #{range.inspect}"
+ end
+ }
+ seq_result << set_result
+ else
+ raise "invalid sequence: #{seq.inspect}"
+ end
+ end
+ result << seq_result
+ }
+ self.new(result)
+ end
+
+ def initialize(pat)
+ @pat = pat
+ end
+
+ def hash
+ @pat.hash
+ end
+
+ def eql?(other)
+ self.class == other.class &&
+ @pat == other.instance_eval { @pat }
+ end
+
+ alias == eql?
+
+ def to_s
+ if @pat.empty?
+ "(empset)"
+ else
+ @pat.map {|seq|
+ if seq.empty?
+ "(empstr)"
+ else
+ seq.map {|byteset|
+ if byteset.length == 1 && byteset[0].begin == byteset[0].end
+ "%02x" % byteset[0].begin
+ else
+ "{" +
+ byteset.map {|range|
+ if range.begin == range.end
+ "%02x" % range.begin
+ else
+ "%02x-%02x" % [range.begin, range.end]
+ end
+ }.join(',') +
+ "}"
+ end
+ }.join('')
+ end
+ }.join(' ')
+ end
+ end
+
+ def inspect
+ "\#<#{self.class}: #{self.to_s}>"
+ end
+
+ def min_length
+ if @pat.empty?
+ nil
+ else
+ @pat.map {|seq| seq.length }.min
+ end
+ end
+
+ def max_length
+ if @pat.empty?
+ nil
+ else
+ @pat.map {|seq| seq.length }.max
+ end
+ end
+
+ def emptyable?
+ @pat.any? {|seq|
+ seq.empty?
+ }
+ end
+
+ def first_bytes
+ result = {}
+ @pat.each {|seq|
+ next if seq.empty?
+ seq.first.each {|range|
+ range.each {|byte|
+ result[byte] = true
+ }
+ }
+ }
+ result.keys.sort
+ end
+
+ def each_firstbyte
+ h = {}
+ @pat.each {|seq|
+ next if seq.empty?
+ seq.first.each {|range|
+ range.each {|byte|
+ (h[byte] ||= []) << seq[1..-1]
+ }
+ }
+ }
+ h.keys.sort.each {|byte|
+ yield byte, StrSet.new(h[byte])
+ }
+ end
+end
+
+class ActionMap
+ def self.parse(hash)
+ h = {}
+ hash.each {|pat, action|
+ h[StrSet.parse(pat)] = action
+ }
+ self.new(h)
+ end
+
+ def initialize(h)
+ @map = h
+ end
+
+ def hash
+ hash = 0
+ @map.each {|k,v|
+ hash ^= k.hash ^ v.hash
+ }
+ hash
+ end
+
+ def eql?(other)
+ self.class == other.class &&
+ @map == other.instance_eval { @map }
+ end
+
+ alias == eql?
+
+ def inspect
+ "\#<#{self.class}:" +
+ @map.map {|k, v| " [" + k.to_s + "]=>" + v.inspect }.join('') +
+ ">"
+ end
+
+ def max_input_length
+ @map.keys.map {|k| k.max_length }.max
+ end
+
+ def empty_action
+ @map.each {|ss, action|
+ return action if ss.emptyable?
+ }
+ nil
+ end
+
+ def each_firstbyte(valid_encoding=nil)
+ h = {}
+ @map.each {|ss, action|
+ if ss.emptyable?
+ raise "emptyable pattern"
+ else
+ ss.each_firstbyte {|byte, rest|
+ h[byte] ||= {}
+ if h[byte][rest]
+ raise "ambiguous"
+ end
+ h[byte][rest] = action
+ }
+ end
+ }
+ if valid_encoding
+ valid_encoding.each_firstbyte {|byte, rest|
+ if h[byte]
+ am = ActionMap.new(h[byte])
+ yield byte, am, rest
+ else
+ am = ActionMap.new(rest => :undef)
+ yield byte, am, nil
+ end
+ }
+ else
+ h.keys.sort.each {|byte|
+ am = ActionMap.new(h[byte])
+ yield byte, am, nil
+ }
+ end
+ end
+
+ OffsetsMemo = {}
+ InfosMemo = {}
+
+ def format_offsets(min, max, offsets)
+ offsets = offsets[min..max]
+ code = "{ %d, %d,\n" % [min, max]
+ 0.step(offsets.length-1,16) {|i|
+ code << " "
+ code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
+ if i+8 < offsets.length
+ code << " "
+ code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+ end
+ code << "\n"
+ }
+ code << '}'
+ code
+ end
+
+ def generate_info(info)
+ case info
+ when :nomap
+ "NOMAP"
+ when :undef
+ "UNDEF"
+ when :invalid
+ "INVALID"
+ when :func_ii
+ "FUNii"
+ when :func_si
+ "FUNsi"
+ when :func_io
+ "FUNio"
+ when :func_so
+ "FUNso"
+ when /\A([0-9a-f][0-9a-f])\z/i
+ "o1(0x#$1)"
+ when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
+ "o2(0x#$1,0x#$2)"
+ when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
+ "o3(0x#$1,0x#$2,0x#$3)"
+ when /\A([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])([0-9a-f][0-9a-f])\z/i
+ "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
+ when /\A&/ # pointer to BYTE_LOOKUP structure
+ info.to_s
+ else
+ raise "unexpected action: #{info.inspect}"
+ end
+ end
+
+ def format_infos(infos)
+ infos = infos.map {|info| generate_info(info) }
+ maxlen = infos.map {|info| info.length }.max
+ columns = maxlen <= 16 ? 4 : 2
+ code = "{\n"
+ 0.step(infos.length-1, columns) {|i|
+ code << " "
+ is = infos[i,columns]
+ is.each {|info|
+ code << sprintf(" %#{maxlen}s,", info)
+ }
+ code << "\n"
+ }
+ code << "}"
+ code
+ end
+
+ def generate_lookup_node(name, table)
+ offsets = []
+ infos = []
+ infomap = {}
+ min = max = nil
+ table.each_with_index {|action, byte|
+ action ||= :invalid
+ if action != :invalid
+ min = byte if !min
+ max = byte
+ end
+ unless o = infomap[action]
+ infomap[action] = o = infos.length
+ infos[o] = action
+ end
+ offsets[byte] = o
+ }
+ if !min
+ min = max = 0
+ end
+
+ offsets_key = [min, max, offsets[min..max]]
+ if n = OffsetsMemo[offsets_key]
+ offsets_name = n
+ offsets_code = ''
+ else
+ offsets_name = "#{name}_offsets"
+ offsets_code = <<"End"
+static const unsigned char
+#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
+End
+ OffsetsMemo[offsets_key] = offsets_name
+ end
+
+ if n = InfosMemo[infos]
+ infos_name = n
+ infos_code = ''
+ else
+ infos_name = "#{name}_infos"
+ infos_code = <<"End"
+static const struct byte_lookup* const
+#{infos_name}[#{infos.length}] = #{format_infos(infos)};
+End
+ InfosMemo[infos] = infos_name
+ end
+
+ r = offsets_code + infos_code + <<"End"
+static const BYTE_LOOKUP
+#{name} = {
+ #{offsets_name},
+ #{infos_name}
+};
+
+End
+ r
+ end
+
+ PreMemo = {}
+ PostMemo = {}
+ NextName = "a"
+
+ def generate_node(code, name_hint=nil, valid_encoding=nil)
+ if n = PreMemo[[self,valid_encoding]]
+ return n
+ end
+
+ table = Array.new(0x100, :invalid)
+ each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
+ if a = rest.empty_action
+ table[byte] = a
+ else
+ name_hint2 = nil
+ name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
+ table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
+ end
+ }
+
+ if n = PostMemo[table]
+ return n
+ end
+
+ if !name_hint
+ name_hint = "fun_" + NextName.dup
+ NextName.succ!
+ end
+
+ PreMemo[[self,valid_encoding]] = PostMemo[table] = name_hint
+
+ code << generate_lookup_node(name_hint, table)
+ name_hint
+ end
+end
+
+def encode_utf8(map)
+ r = []
+ map.each {|k, v|
+ # integer means UTF-8 encoded sequence.
+ k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
+ v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
+ r << [k,v]
+ }
+ r
+end
+
+def transcode_compile_tree(name, from, map)
+ map = encode_utf8(map)
+ h = {}
+ map.each {|k, v|
+ h[k] = v
+ }
+ am = ActionMap.parse(h)
+
+ max_input = am.max_input_length
+
+ if ValidEncoding[from]
+ valid_encoding = StrSet.parse(ValidEncoding[from])
+ else
+ valid_encoding = nil
+ end
+
+ code = ''
+ defined_name = am.generate_node(code, name, valid_encoding)
+ return defined_name, code, max_input
+end
+
+TRANSCODERS = []
+
+def transcode_tblgen(from, to, map)
+ STDERR.puts "converter from #{from} to #{to}" if VERBOSE_MODE
+ id_from = from.tr('^0-9A-Za-z', '_')
+ id_to = to.tr('^0-9A-Za-z', '_')
+ if from == "UTF-8"
+ tree_name = "to_#{id_to}"
+ elsif to == "UTF-8"
+ tree_name = "from_#{id_from}"
+ else
+ tree_name = "from_#{id_from}_to_#{id_to}"
+ end
+ map = encode_utf8(map)
+ real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map)
+ transcoder_name = "rb_#{tree_name}"
+ TRANSCODERS << transcoder_name
+ input_unit_length = UnitLength[from]
+ max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
+ transcoder_code = <<"End"
+static const rb_transcoder
+#{transcoder_name} = {
+ #{c_esc from}, #{c_esc to}, &#{real_tree_name},
+ #{input_unit_length}, /* input_unit_length */
+ #{max_input}, /* max_input */
+ #{max_output}, /* max_output */
+ stateless_converter, /* stateful_type */
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL
+};
+End
+ tree_code + "\n" + transcoder_code
+end
+
+def transcode_generate_node(am, name_hint=nil)
+ STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
+ code = ''
+ am.generate_node(code, name_hint)
+ code
+end
+
+def transcode_register_code
+ code = ''
+ TRANSCODERS.each {|transcoder_name|
+ code << " rb_register_transcoder(&#{transcoder_name});\n"
+ }
+ code
+end
+
+UnitLength = {
+ 'UTF-16BE' => 2,
+ 'UTF-16LE' => 2,
+ 'UTF-32BE' => 4,
+ 'UTF-32LE' => 4,
+}
+UnitLength.default = 1
+
+ValidEncoding = {
+ '1byte' => '{00-ff}',
+ '2byte' => '{00-ff}{00-ff}',
+ '4byte' => '{00-ff}{00-ff}{00-ff}{00-ff}',
+ 'US-ASCII' => '{00-7f}',
+ 'UTF-8' => '{00-7f}
+ {c2-df}{80-bf}
+ e0{a0-bf}{80-bf}
+ {e1-ec}{80-bf}{80-bf}
+ ed{80-9f}{80-bf}
+ {ee-ef}{80-bf}{80-bf}
+ f0{90-bf}{80-bf}{80-bf}
+ {f1-f3}{80-bf}{80-bf}{80-bf}
+ f4{80-8f}{80-bf}{80-bf}',
+ 'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
+ {d8-db}{00-ff}{dc-df}{00-ff}',
+ 'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
+ {00-ff}{d8-db}{00-ff}{dc-df}',
+ 'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
+ 00{01-10}{00-ff}{00-ff}',
+ 'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
+ {00-ff}{00-ff}{01-10}00',
+ 'EUC-JP' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-fe}
+ 8f{a1-fe}{a1-fe}',
+ 'CP51932' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-fe}',
+ 'Shift_JIS' => '{00-7f}
+ {81-9f,e0-fc}{40-7e,80-fc}
+ {a1-df}',
+ 'EUC-KR' => '{00-7f}
+ {a1-fe}{a1-fe}',
+ 'CP949' => '{00-7f}
+ {81-fe}{41-5a,61-7a,81-fe}',
+ 'Big5' => '{00-7f}
+ {81-fe}{40-7e,a1-fe}',
+ 'EUC-TW' => '{00-7f}
+ {a1-fe}{a1-fe}
+ 8e{a1-b0}{a1-fe}{a1-fe}',
+ 'GBK' => '{00-80}
+ {81-fe}{40-7e,80-fe}',
+ 'GB18030' => '{00-7f}
+ {81-fe}{40-7e,80-fe}
+ {81-fe}{30-39}{81-fe}{30-39}',
+}
+
+{
+ 'ASCII-8BIT' => '1byte',
+ 'ISO-8859-1' => '1byte',
+ 'ISO-8859-2' => '1byte',
+ 'ISO-8859-3' => '1byte',
+ 'ISO-8859-4' => '1byte',
+ 'ISO-8859-5' => '1byte',
+ 'ISO-8859-6' => '1byte',
+ 'ISO-8859-7' => '1byte',
+ 'ISO-8859-8' => '1byte',
+ 'ISO-8859-9' => '1byte',
+ 'ISO-8859-10' => '1byte',
+ 'ISO-8859-11' => '1byte',
+ 'ISO-8859-13' => '1byte',
+ 'ISO-8859-14' => '1byte',
+ 'ISO-8859-15' => '1byte',
+ 'Windows-31J' => 'Shift_JIS',
+}.each {|k, v|
+ ValidEncoding[k] = ValidEncoding.fetch(v)
+}
+
+def make_signature(filename, src)
+ "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
+end
+
+output_filename = nil
+verbose_mode = false
+force_mode = false
+
+op = OptionParser.new
+op.def_option("--help", "show help message") { puts op; exit 0 }
+op.def_option("--verbose", "verbose mode") { verbose_mode = true }
+op.def_option("--force", "force table generation") { force_mode = true }
+op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
+op.parse!
+
+VERBOSE_MODE = verbose_mode
+
+arg = ARGV.shift
+dir = File.dirname(arg)
+$:.unshift dir unless $:.include? dir
+src = File.read(arg)
+src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
+this_script = File.read(__FILE__)
+this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding
+
+base_signature = "/* autogenerated. */\n"
+base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
+base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"
+
+if !force_mode && output_filename && File.readable?(output_filename)
+ old_signature = File.open(output_filename) {|f| f.gets("").chomp }
+ chk_signature = base_signature.dup
+ old_signature.each_line {|line|
+ if %r{/\* src="([0-9a-z_.-]+)",} =~ line
+ name = $1
+ next if name == File.basename(arg) || name == File.basename(__FILE__)
+ path = File.join(dir, name)
+ if File.readable? path
+ chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
+ end
+ end
+ }
+ if old_signature == chk_signature
+ now = Time.now
+ File.utime(now, now, output_filename)
+ STDERR.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE
+ exit
+ end
+end
+
+if VERBOSE_MODE
+ if output_filename
+ STDERR.puts "generating #{output_filename} ..."
+ end
+end
+
+libs1 = $".dup
+erb_result = ERB.new(src, nil, '%').result(binding)
+libs2 = $".dup
+
+libs = libs2 - libs1
+lib_sigs = ''
+libs.each {|lib|
+ lib = File.basename(lib)
+ path = File.join(dir, lib)
+ if File.readable? path
+ lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
+ end
+}
+
+result = ''
+result << base_signature
+result << lib_sigs
+result << "\n"
+result << erb_result
+result << "\n"
+
+if output_filename
+ new_filename = output_filename + ".new"
+ FileUtils.mkdir_p(File.dirname(output_filename))
+ File.open(new_filename, "wb") {|f| f << result }
+ File.rename(new_filename, output_filename)
+ STDERR.puts "done." if VERBOSE_MODE
+else
+ print result
+end