diff options
Diffstat (limited to 'tool/enc-unicode.rb')
-rwxr-xr-x | tool/enc-unicode.rb | 132 |
1 files changed, 86 insertions, 46 deletions
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb index 93f6e869f8..9d49f427bb 100755 --- a/tool/enc-unicode.rb +++ b/tool/enc-unicode.rb @@ -5,14 +5,30 @@ # # To use this, get UnicodeData.txt, Scripts.txt, PropList.txt, # PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt, -# DerivedAge.txt and Blocks.txt from unicode.org. +# DerivedAge.txt, Blocks.txt, emoji/emoji-data.txt, +# auxiliary/GraphemeBreakProperty.txt from unicode.org # (http://unicode.org/Public/UNIDATA/) And run following command. -# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd +# tool/enc-unicode.rb data_dir emoji_data_dir > enc/unicode/name2ctype.kwd # You can get source file for gperf. After this, simply make ruby. - -if ARGV[0] == "--header" - header = true - ARGV.shift +# Or directly run: +# tool/enc-unicode.rb --header data_dir emoji_data_dir > enc/unicode/<VERSION>/name2ctype.h + +while arg = ARGV.shift + case arg + when "--" + break + when "--header" + header = true + when "--diff" + diff = ARGV.shift or abort "#{$0}: --diff=DIFF-COMMAND" + when /\A--diff=(.+)/m + diff = $1 + when /\A-/ + abort "#{$0}: unknown option #{arg}" + else + ARGV.unshift(arg) + break + end end unless ARGV.size == 2 abort "Usage: #{$0} data_directory emoji_data_directory" @@ -59,7 +75,7 @@ def parse_unicode_data(file) data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [], 'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []} beg_cp = nil - IO.foreach(file) do |line| + File.foreach(file) do |line| fields = line.split(';') cp = fields[0].to_i(16) @@ -150,7 +166,7 @@ def parse_scripts(data, categories) categories[current] = file[:title] (names[file[:title]] ||= []) << current cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line + elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line current = $3 $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) end @@ -205,7 +221,7 @@ def parse_age(data) ages << current last_constname = constname cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line + elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\d+\.\d+)/ =~ line current = $3 $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) end @@ -224,7 +240,7 @@ def parse_GraphemeBreakProperty(data) make_const(constname, cps, "Grapheme_Cluster_Break=#{current}") ages << current cps = [] - elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line + elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line current = $3 $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16)) end @@ -236,7 +252,7 @@ def parse_block(data) cps = [] blocks = [] data_foreach('Blocks.txt') do |line| - if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line + if /^(\h+)\.\.(\h+);\s*(.*)/ =~ line cps = ($1.to_i(16)..$2.to_i(16)).to_a constname = constantize_blockname($3) data[constname] = cps @@ -253,23 +269,12 @@ def parse_block(data) blocks << constname end -# shim for Ruby 1.8 -unless {}.respond_to?(:key) - class Hash - alias key index - end -end - $const_cache = {} # make_const(property, pairs, name): Prints a 'static const' structure for a # given property, group of paired codepoints, and a human-friendly name for # the group def make_const(prop, data, name) - if name.empty? - puts "\n/* '#{prop}' */" - else - puts "\n/* '#{prop}': #{name} */" - end + puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation if origprop = $const_cache.key(data) puts "#define CR_#{prop} CR_#{origprop}" else @@ -311,18 +316,19 @@ end def data_foreach(name, &block) fn = get_file(name) warn "Reading #{name}" - if /^emoji/ =~ name - sep = "" - pat = /^# #{Regexp.quote(File.basename(name))}.*^# Version: ([\d.]+)/m - type = :Emoji - else - sep = "\n" - pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/ - type = :Unicode - end File.open(fn, 'rb') do |f| - line = f.gets(sep) - unless version = line[pat, 1] + if /^emoji/ =~ name + line = f.gets("") + # Headers till Emoji 13 or 15 + version = line[/^# #{Regexp.quote(File.basename(name))}.*(?:^# Version:|Emoji Version) ([\d.]+)/m, 1] + type = :Emoji + else + # Headers since Emoji 14 or other Unicode data + line = f.gets("\n") + type = :Unicode + end + version ||= line[/^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/, 1] + unless version raise ArgumentError, <<-ERROR #{name}: no #{type} version #{line.gsub(/^/, '> ')} @@ -330,7 +336,7 @@ def data_foreach(name, &block) end if !(v = $versions[type]) $versions[type] = version - elsif v != version + elsif v != version and "#{v}.0" != version raise ArgumentError, <<-ERROR #{name}: #{type} version mismatch: #{version} to #{v} #{line.gsub(/^/, '> ')} @@ -420,8 +426,6 @@ define_posix_props(data) POSIX_NAMES.each do |name| if name == 'XPosixPunct' make_const(name, data[name], "[[:Punct:]]") - elsif name == 'Punct' - make_const(name, data[name], "") else make_const(name, data[name], "[[:#{name}:]]") end @@ -464,11 +468,7 @@ struct uniname2ctype_struct { }; #define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str) -static const struct uniname2ctype_struct *uniname2ctype_p( -#if !(/*ANSI*/+0) /* if ANSI, old style not to conflict with generated prototype */ - const char *, unsigned int -#endif -); +static const struct uniname2ctype_struct *uniname2ctype_p(register const char *str, register size_t len); %} struct uniname2ctype_struct; %% @@ -547,6 +547,47 @@ output.restore if header require 'tempfile' + def diff_args(diff) + ok = IO.popen([diff, "-DDIFF_TEST", IO::NULL, "-"], "r+") do |f| + f.puts "Test for diffutils 3.8" + f.close_write + /^#if/ =~ f.read + end + if ok + proc {|macro, *inputs| + [diff, "-D#{macro}", *inputs] + } + else + IO.popen([diff, "--old-group-format=%<", "--new-group-format=%>", IO::NULL, IO::NULL], err: %i[child out], &:read) + unless $?.success? + abort "#{$0}: #{diff} -D does not work" + end + warn "Avoiding diffutils 3.8 bug#61193" + proc {|macro, *inputs| + [diff] + [ + "--old-group-format=" \ + "#ifndef @\n" \ + "%<" \ + "#endif /* ! @ */\n", + + "--new-group-format=" \ + "#ifdef @\n" \ + "%>" \ + "#endif /* @ */\n", + + "--changed-group-format=" \ + "#ifndef @\n" \ + "%<" \ + "#else /* @ */\n" \ + "%>" \ + "#endif /* @ */\n" + ].map {|opt| opt.gsub(/@/) {macro}} + inputs + } + end + end + + ifdef = diff_args(diff || "diff") + NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p] fds = [] @@ -556,9 +597,8 @@ if header IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)} end while syms.pop fds.each(&:close) - ff = nil - IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age| - IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f| + IO.popen(ifdef["USE_UNICODE_AGE_PROPERTIES", fds[1].path, fds[0].path], "r") {|age| + IO.popen(ifdef["USE_UNICODE_PROPERTIES", fds[2].path, "-"], "r", in: age) {|f| ansi = false f.each {|line| if /ANSI-C code produced by gperf/ =~ line @@ -567,7 +607,7 @@ if header line.sub!(/\/\*ANSI\*\//, '1') if ansi line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/, 'uniname2ctype_offset(\1), ') - if ff = (!ff ? /^(uniname2ctype_hash) /=~line : /^\}/!~line) # no line can match both, exclusive flip-flop + if line.start_with?("uniname2ctype_hash\s") ... line.start_with?("}") line.sub!(/^( *(?:register\s+)?(.*\S)\s+hval\s*=\s*)(?=len;)/, '\1(\2)') end puts line |