1 files changed, 86 insertions, 46 deletions
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
index 93f6e869f8..9d49f427bb 100755
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@@ -5,14 +5,30 @@
 #
 # To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
 # PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
-# DerivedAge.txt and Blocks.txt  from unicode.org.
+# DerivedAge.txt, Blocks.txt, emoji/emoji-data.txt,
+# auxiliary/GraphemeBreakProperty.txt from unicode.org
 # (http://unicode.org/Public/UNIDATA/) And run following command.
-# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
+# tool/enc-unicode.rb data_dir emoji_data_dir > enc/unicode/name2ctype.kwd
 # You can get source file for gperf.  After this, simply make ruby.
-
-if ARGV[0] == "--header"
-  header = true
-  ARGV.shift
+# Or directly run:
+# tool/enc-unicode.rb --header data_dir emoji_data_dir > enc/unicode/<VERSION>/name2ctype.h
+
+while arg = ARGV.shift
+  case arg
+  when "--"
+    break
+  when "--header"
+    header = true
+  when "--diff"
+    diff = ARGV.shift or abort "#{$0}: --diff=DIFF-COMMAND"
+  when /\A--diff=(.+)/m
+    diff = $1
+  when /\A-/
+    abort "#{$0}: unknown option #{arg}"
+  else
+    ARGV.unshift(arg)
+    break
+  end
 end
 unless ARGV.size == 2
   abort "Usage: #{$0} data_directory emoji_data_directory"
@@ -59,7 +75,7 @@ def parse_unicode_data(file)
   data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
     'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
   beg_cp = nil
-  IO.foreach(file) do |line|
+  File.foreach(file) do |line|
     fields = line.split(';')
     cp = fields[0].to_i(16)
 
@@ -150,7 +166,7 @@ def parse_scripts(data, categories)
         categories[current] = file[:title]
         (names[file[:title]] ||= []) << current
         cps = []
-      elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+      elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
         current = $3
         $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
       end
@@ -205,7 +221,7 @@ def parse_age(data)
       ages << current
       last_constname = constname
       cps = []
-    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
+    elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\d+\.\d+)/ =~ line
       current = $3
       $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
     end
@@ -224,7 +240,7 @@ def parse_GraphemeBreakProperty(data)
       make_const(constname, cps, "Grapheme_Cluster_Break=#{current}")
       ages << current
       cps = []
-    elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+    elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
       current = $3
       $2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
     end
@@ -236,7 +252,7 @@ def parse_block(data)
   cps = []
   blocks = []
   data_foreach('Blocks.txt') do |line|
-    if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
+    if /^(\h+)\.\.(\h+);\s*(.*)/ =~ line
       cps = ($1.to_i(16)..$2.to_i(16)).to_a
       constname = constantize_blockname($3)
       data[constname] = cps
@@ -253,23 +269,12 @@ def parse_block(data)
   blocks << constname
 end
 
-# shim for Ruby 1.8
-unless {}.respond_to?(:key)
-  class Hash
-    alias key index
-  end
-end
-
 $const_cache = {}
 # make_const(property, pairs, name): Prints a 'static const' structure for a
 # given property, group of paired codepoints, and a human-friendly name for
 # the group
 def make_const(prop, data, name)
-  if name.empty?
-    puts "\n/* '#{prop}' */"
-  else
-    puts "\n/* '#{prop}': #{name} */"
-  end
+  puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation
   if origprop = $const_cache.key(data)
     puts "#define CR_#{prop} CR_#{origprop}"
   else
@@ -311,18 +316,19 @@ end
 def data_foreach(name, &block)
   fn = get_file(name)
   warn "Reading #{name}"
-  if /^emoji/ =~ name
-    sep = ""
-    pat = /^# #{Regexp.quote(File.basename(name))}.*^# Version: ([\d.]+)/m
-    type = :Emoji
-  else
-    sep = "\n"
-    pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/
-    type = :Unicode
-  end
   File.open(fn, 'rb') do |f|
-    line = f.gets(sep)
-    unless version = line[pat, 1]
+    if /^emoji/ =~ name
+      line = f.gets("")
+      # Headers till Emoji 13 or 15
+      version = line[/^# #{Regexp.quote(File.basename(name))}.*(?:^# Version:|Emoji Version) ([\d.]+)/m, 1]
+      type = :Emoji
+    else
+      # Headers since Emoji 14 or other Unicode data
+      line = f.gets("\n")
+      type = :Unicode
+    end
+    version ||= line[/^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/, 1]
+    unless version
       raise ArgumentError, <<-ERROR
 #{name}: no #{type} version
 #{line.gsub(/^/, '> ')}
@@ -330,7 +336,7 @@ def data_foreach(name, &block)
     end
     if !(v = $versions[type])
       $versions[type] = version
-    elsif v != version
+    elsif v != version and "#{v}.0" != version
       raise ArgumentError, <<-ERROR
 #{name}: #{type} version mismatch: #{version} to #{v}
 #{line.gsub(/^/, '> ')}
@@ -420,8 +426,6 @@ define_posix_props(data)
 POSIX_NAMES.each do |name|
   if name == 'XPosixPunct'
     make_const(name, data[name], "[[:Punct:]]")
-  elsif name == 'Punct'
-    make_const(name, data[name], "")
   else
     make_const(name, data[name], "[[:#{name}:]]")
   end
@@ -464,11 +468,7 @@ struct uniname2ctype_struct {
 };
 #define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
 
-static const struct uniname2ctype_struct *uniname2ctype_p(
-#if !(/*ANSI*/+0) /* if ANSI, old style not to conflict with generated prototype */
-    const char *, unsigned int
-#endif
-);
+static const struct uniname2ctype_struct *uniname2ctype_p(register const char *str, register size_t len);
 %}
 struct uniname2ctype_struct;
 %%
@@ -547,6 +547,47 @@ output.restore
 if header
   require 'tempfile'
 
+  def diff_args(diff)
+    ok = IO.popen([diff, "-DDIFF_TEST", IO::NULL, "-"], "r+") do |f|
+      f.puts "Test for diffutils 3.8"
+      f.close_write
+      /^#if/ =~ f.read
+    end
+    if ok
+      proc {|macro, *inputs|
+        [diff, "-D#{macro}", *inputs]
+      }
+    else
+      IO.popen([diff, "--old-group-format=%<", "--new-group-format=%>", IO::NULL, IO::NULL], err: %i[child out], &:read)
+      unless $?.success?
+        abort "#{$0}: #{diff} -D does not work"
+      end
+      warn "Avoiding diffutils 3.8 bug#61193"
+      proc {|macro, *inputs|
+        [diff] + [
+          "--old-group-format=" \
+	  "#ifndef @\n" \
+	  "%<" \
+	  "#endif /* ! @ */\n",
+
+          "--new-group-format=" \
+	  "#ifdef @\n" \
+	  "%>" \
+	  "#endif /* @ */\n",
+
+	  "--changed-group-format=" \
+	  "#ifndef @\n" \
+	  "%<" \
+	  "#else /* @ */\n" \
+	  "%>" \
+	  "#endif /* @ */\n"
+        ].map {|opt| opt.gsub(/@/) {macro}} + inputs
+      }
+    end
+  end
+
+  ifdef = diff_args(diff || "diff")
+
   NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p]
 
   fds = []
@@ -556,9 +597,8 @@ if header
     IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)}
   end while syms.pop
   fds.each(&:close)
-  ff = nil
-  IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age|
-    IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f|
+  IO.popen(ifdef["USE_UNICODE_AGE_PROPERTIES", fds[1].path, fds[0].path], "r") {|age|
+    IO.popen(ifdef["USE_UNICODE_PROPERTIES", fds[2].path, "-"], "r", in: age) {|f|
       ansi = false
       f.each {|line|
         if /ANSI-C code produced by gperf/ =~ line
@@ -567,7 +607,7 @@ if header
         line.sub!(/\/\*ANSI\*\//, '1') if ansi
         line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/,
                    'uniname2ctype_offset(\1), ')
-        if ff = (!ff ? /^(uniname2ctype_hash) /=~line : /^\}/!~line) # no line can match both, exclusive flip-flop
+        if line.start_with?("uniname2ctype_hash\s") ... line.start_with?("}")
           line.sub!(/^( *(?:register\s+)?(.*\S)\s+hval\s*=\s*)(?=len;)/, '\1(\2)')
         end
         puts line