summaryrefslogtreecommitdiff
path: root/tool/enc-unicode.rb
diff options
context:
space:
mode:
Diffstat (limited to 'tool/enc-unicode.rb')
-rwxr-xr-xtool/enc-unicode.rb132
1 files changed, 86 insertions, 46 deletions
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
index 93f6e869f8..9d49f427bb 100755
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@@ -5,14 +5,30 @@
#
# To use this, get UnicodeData.txt, Scripts.txt, PropList.txt,
# PropertyAliases.txt, PropertyValueAliases.txt, DerivedCoreProperties.txt,
-# DerivedAge.txt and Blocks.txt from unicode.org.
+# DerivedAge.txt, Blocks.txt, emoji/emoji-data.txt,
+# auxiliary/GraphemeBreakProperty.txt from unicode.org
# (http://unicode.org/Public/UNIDATA/) And run following command.
-# ruby1.9 tool/enc-unicode.rb data_dir > enc/unicode/name2ctype.kwd
+# tool/enc-unicode.rb data_dir emoji_data_dir > enc/unicode/name2ctype.kwd
# You can get source file for gperf. After this, simply make ruby.
-
-if ARGV[0] == "--header"
- header = true
- ARGV.shift
+# Or directly run:
+# tool/enc-unicode.rb --header data_dir emoji_data_dir > enc/unicode/<VERSION>/name2ctype.h
+
+while arg = ARGV.shift
+ case arg
+ when "--"
+ break
+ when "--header"
+ header = true
+ when "--diff"
+ diff = ARGV.shift or abort "#{$0}: --diff=DIFF-COMMAND"
+ when /\A--diff=(.+)/m
+ diff = $1
+ when /\A-/
+ abort "#{$0}: unknown option #{arg}"
+ else
+ ARGV.unshift(arg)
+ break
+ end
end
unless ARGV.size == 2
abort "Usage: #{$0} data_directory emoji_data_directory"
@@ -59,7 +75,7 @@ def parse_unicode_data(file)
data = {'Any' => (0x0000..0x10ffff).to_a, 'Assigned' => [],
'ASCII' => (0..0x007F).to_a, 'NEWLINE' => [0x0a], 'Cn' => []}
beg_cp = nil
- IO.foreach(file) do |line|
+ File.foreach(file) do |line|
fields = line.split(';')
cp = fields[0].to_i(16)
@@ -150,7 +166,7 @@ def parse_scripts(data, categories)
categories[current] = file[:title]
(names[file[:title]] ||= []) << current
cps = []
- elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+ elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
current = $3
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
end
@@ -205,7 +221,7 @@ def parse_age(data)
ages << current
last_constname = constname
cps = []
- elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\d+\.\d+)/ =~ line
+ elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\d+\.\d+)/ =~ line
current = $3
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
end
@@ -224,7 +240,7 @@ def parse_GraphemeBreakProperty(data)
make_const(constname, cps, "Grapheme_Cluster_Break=#{current}")
ages << current
cps = []
- elsif /^([0-9a-fA-F]+)(?:\.\.([0-9a-fA-F]+))?\s*;\s*(\w+)/ =~ line
+ elsif /^(\h+)(?:\.\.(\h+))?\s*;\s*(\w+)/ =~ line
current = $3
$2 ? cps.concat(($1.to_i(16)..$2.to_i(16)).to_a) : cps.push($1.to_i(16))
end
@@ -236,7 +252,7 @@ def parse_block(data)
cps = []
blocks = []
data_foreach('Blocks.txt') do |line|
- if /^([0-9a-fA-F]+)\.\.([0-9a-fA-F]+);\s*(.*)/ =~ line
+ if /^(\h+)\.\.(\h+);\s*(.*)/ =~ line
cps = ($1.to_i(16)..$2.to_i(16)).to_a
constname = constantize_blockname($3)
data[constname] = cps
@@ -253,23 +269,12 @@ def parse_block(data)
blocks << constname
end
-# shim for Ruby 1.8
-unless {}.respond_to?(:key)
- class Hash
- alias key index
- end
-end
-
$const_cache = {}
# make_const(property, pairs, name): Prints a 'static const' structure for a
# given property, group of paired codepoints, and a human-friendly name for
# the group
def make_const(prop, data, name)
- if name.empty?
- puts "\n/* '#{prop}' */"
- else
- puts "\n/* '#{prop}': #{name} */"
- end
+ puts "\n/* '#{prop}': #{name} */" # comment used to generate documentation
if origprop = $const_cache.key(data)
puts "#define CR_#{prop} CR_#{origprop}"
else
@@ -311,18 +316,19 @@ end
def data_foreach(name, &block)
fn = get_file(name)
warn "Reading #{name}"
- if /^emoji/ =~ name
- sep = ""
- pat = /^# #{Regexp.quote(File.basename(name))}.*^# Version: ([\d.]+)/m
- type = :Emoji
- else
- sep = "\n"
- pat = /^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/
- type = :Unicode
- end
File.open(fn, 'rb') do |f|
- line = f.gets(sep)
- unless version = line[pat, 1]
+ if /^emoji/ =~ name
+ line = f.gets("")
+ # Headers till Emoji 13 or 15
+ version = line[/^# #{Regexp.quote(File.basename(name))}.*(?:^# Version:|Emoji Version) ([\d.]+)/m, 1]
+ type = :Emoji
+ else
+ # Headers since Emoji 14 or other Unicode data
+ line = f.gets("\n")
+ type = :Unicode
+ end
+ version ||= line[/^# #{File.basename(name).sub(/\./, '-([\\d.]+)\\.')}/, 1]
+ unless version
raise ArgumentError, <<-ERROR
#{name}: no #{type} version
#{line.gsub(/^/, '> ')}
@@ -330,7 +336,7 @@ def data_foreach(name, &block)
end
if !(v = $versions[type])
$versions[type] = version
- elsif v != version
+ elsif v != version and "#{v}.0" != version
raise ArgumentError, <<-ERROR
#{name}: #{type} version mismatch: #{version} to #{v}
#{line.gsub(/^/, '> ')}
@@ -420,8 +426,6 @@ define_posix_props(data)
POSIX_NAMES.each do |name|
if name == 'XPosixPunct'
make_const(name, data[name], "[[:Punct:]]")
- elsif name == 'Punct'
- make_const(name, data[name], "")
else
make_const(name, data[name], "[[:#{name}:]]")
end
@@ -464,11 +468,7 @@ struct uniname2ctype_struct {
};
#define uniname2ctype_offset(str) offsetof(struct uniname2ctype_pool_t, uniname2ctype_pool_##str)
-static const struct uniname2ctype_struct *uniname2ctype_p(
-#if !(/*ANSI*/+0) /* if ANSI, old style not to conflict with generated prototype */
- const char *, unsigned int
-#endif
-);
+static const struct uniname2ctype_struct *uniname2ctype_p(register const char *str, register size_t len);
%}
struct uniname2ctype_struct;
%%
@@ -547,6 +547,47 @@ output.restore
if header
require 'tempfile'
+ def diff_args(diff)
+ ok = IO.popen([diff, "-DDIFF_TEST", IO::NULL, "-"], "r+") do |f|
+ f.puts "Test for diffutils 3.8"
+ f.close_write
+ /^#if/ =~ f.read
+ end
+ if ok
+ proc {|macro, *inputs|
+ [diff, "-D#{macro}", *inputs]
+ }
+ else
+ IO.popen([diff, "--old-group-format=%<", "--new-group-format=%>", IO::NULL, IO::NULL], err: %i[child out], &:read)
+ unless $?.success?
+ abort "#{$0}: #{diff} -D does not work"
+ end
+ warn "Avoiding diffutils 3.8 bug#61193"
+ proc {|macro, *inputs|
+ [diff] + [
+ "--old-group-format=" \
+ "#ifndef @\n" \
+ "%<" \
+ "#endif /* ! @ */\n",
+
+ "--new-group-format=" \
+ "#ifdef @\n" \
+ "%>" \
+ "#endif /* @ */\n",
+
+ "--changed-group-format=" \
+ "#ifndef @\n" \
+ "%<" \
+ "#else /* @ */\n" \
+ "%>" \
+ "#endif /* @ */\n"
+ ].map {|opt| opt.gsub(/@/) {macro}} + inputs
+ }
+ end
+ end
+
+ ifdef = diff_args(diff || "diff")
+
NAME2CTYPE = %w[gperf -7 -c -j1 -i1 -t -C -P -T -H uniname2ctype_hash -Q uniname2ctype_pool -N uniname2ctype_p]
fds = []
@@ -556,9 +597,8 @@ if header
IO.popen([*NAME2CTYPE, out: tmp], "w") {|f| output.show(f, *syms)}
end while syms.pop
fds.each(&:close)
- ff = nil
- IO.popen(%W[diff -DUSE_UNICODE_AGE_PROPERTIES #{fds[1].path} #{fds[0].path}], "r") {|age|
- IO.popen(%W[diff -DUSE_UNICODE_PROPERTIES #{fds[2].path} -], "r", in: age) {|f|
+ IO.popen(ifdef["USE_UNICODE_AGE_PROPERTIES", fds[1].path, fds[0].path], "r") {|age|
+ IO.popen(ifdef["USE_UNICODE_PROPERTIES", fds[2].path, "-"], "r", in: age) {|f|
ansi = false
f.each {|line|
if /ANSI-C code produced by gperf/ =~ line
@@ -567,7 +607,7 @@ if header
line.sub!(/\/\*ANSI\*\//, '1') if ansi
line.gsub!(/\(int\)\((?:long|size_t)\)&\(\(struct uniname2ctype_pool_t \*\)0\)->uniname2ctype_pool_(str\d+),\s+/,
'uniname2ctype_offset(\1), ')
- if ff = (!ff ? /^(uniname2ctype_hash) /=~line : /^\}/!~line) # no line can match both, exclusive flip-flop
+ if line.start_with?("uniname2ctype_hash\s") ... line.start_with?("}")
line.sub!(/^( *(?:register\s+)?(.*\S)\s+hval\s*=\s*)(?=len;)/, '\1(\2)')
end
puts line