summaryrefslogtreecommitdiff
path: root/enc/unicode/case-folding.rb
diff options
context:
space:
mode:
authornobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-05-30 23:57:45 +0000
committernobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-05-30 23:57:45 +0000
commitc39e659263b9a988c39ff97aca3ffde9a482e4e4 (patch)
tree81778db8f4ee13b6e1933a1c88227074b7e87dcf /enc/unicode/case-folding.rb
parent88eae35862f3b228443f116234cbf09057c361c8 (diff)
case-folding.rb: perfect hash for case folding
* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to lookup case folding table. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46269 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc/unicode/case-folding.rb')
-rwxr-xr-xenc/unicode/case-folding.rb62
1 files changed, 56 insertions, 6 deletions
diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb
index f1faaebf05..0ccc822d39 100755
--- a/enc/unicode/case-folding.rb
+++ b/enc/unicode/case-folding.rb
@@ -13,21 +13,23 @@ class CaseFolding
end
def print_table_1(dest, data)
- for k, v in data.sort
+ for k, v in data = data.sort
sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
dest.print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
end
+ data
end
def print_table(dest, type, data)
dest.print("static const #{type}_Type #{type}_Table[] = {\n")
i = 0
- data.each do |n, d|
+ ret = data.inject([]) do |a, (n, d)|
dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
i += d.size
- print_table_1(dest, d)
+ a.concat(print_table_1(dest, d))
end
dest.print("};\n\n")
+ ret
end
end
@@ -76,6 +78,55 @@ class CaseFolding
self
end
+ def lookup_hash(key, type, data)
+ hash = "onigenc_unicode_#{key}_hash"
+ lookup = "onigenc_unicode_#{key}_lookup"
+ gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
+ argname = "code"
+ argdecl = "const OnigCodePoint #{argname}"
+ n = 7
+ m = (1 << n) - 1
+ min, max = data.map {|c, *|c}.minmax
+ src = IO.popen(gperf, "r+") {|f|
+ f << "short\n%%\n"
+ data.each_with_index {|(k, _), i|
+ ks = [(k >> n*2) & m, (k >> n) & m, (k) & m].map {|c| "\\x%.2x" % c}.join("")
+ f.printf "\"%s\", ::::/*0x%.4x*/ %d\n", ks, k, i
+ }
+ f << "%%\n"
+ f.close_write
+ f.read
+ }
+ src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
+ name = $1
+ body = $2
+ body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
+ "#{name}(#{argdecl})\n{\n#{body}}"
+ }
+ src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
+ name = $1
+ body = $2
+ body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
+ body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
+ body.gsub!(/\{"",-1}/, "-1")
+ body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
+ body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
+ v = nil
+ body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
+ pre = $1
+ indent = $2
+ s = $3
+ s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
+ v = $1
+ s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
+ "#{pre}{#{s}\n#{indent}}"
+ }
+ body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
+ "static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
+ }
+ src
+ end
+
def display(dest)
# print the header
dest.print("/* DO NOT EDIT THIS FILE. */\n")
@@ -85,7 +136,8 @@ class CaseFolding
# CaseFold + CaseFold_Locale
name = "CaseFold_11"
- print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
+ data = print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
+ dest.print lookup_hash(name, "CodePointList3", data)
# print unfolding data
@@ -102,8 +154,6 @@ class CaseFolding
print_table(dest, name, name=>unfold[2])
# table sizes
- fold_table_size = fold.size + fold_locale.size
- dest.printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
unfold1_table_size = unfold[0].size + unfold_locale[0].size
dest.printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
unfold2_table_size = unfold[1].size + unfold_locale[1].size