summaryrefslogtreecommitdiff
path: root/enc/unicode/case-folding.rb
diff options
context:
space:
mode:
authornobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-05-30 23:49:54 +0000
committernobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2014-05-30 23:49:54 +0000
commit0148bd15e4928582adebc4afe4e18db30b68a5a6 (patch)
treefe08705855c410d8bd15f4b69f22c0e987e259c8 /enc/unicode/case-folding.rb
parent40ec5528612ad426fac80b8b0ea97009fec7458d (diff)
case-folding.rb: conversion script
* enc/unicode/case-folding.rb: script to convert CaseFolding.txt, translated from CaseFolding.py. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46266 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc/unicode/case-folding.rb')
-rwxr-xr-xenc/unicode/case-folding.rb107
1 files changed, 107 insertions, 0 deletions
diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb
new file mode 100755
index 0000000000..e39bef20f8
--- /dev/null
+++ b/enc/unicode/case-folding.rb
@@ -0,0 +1,107 @@
+#!/usr/bin/ruby
+
+# Usage:
+# $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
+# $ ruby case-folding.rb CaseFolding.txt > ../enc/unicode/casefold.h
+
+
# Renders a list of code points as a C initializer fragment.
#
# v - collection of Integers (anything responding to +map+).
#
# Returns a String such as "0x0069, 0x0307": each value in lowercase hex,
# zero-padded to at least four digits, separated by ", ". An empty
# collection yields "".
def hex_seq(v)
  formatted = v.map do |code_point|
    "0x%04x" % code_point
  end
  formatted.join(", ")
end
+
# Prints one C array definition to standard output.
#
# table - String holding the C element type and array name, e.g.
#         "CaseFold_11_Type CaseFold"; it is emitted verbatim after
#         "static const ".
# data  - Hash mapping a key (an Integer code point, or an Array of code
#         points) to an Array of code points. Entries are emitted in the
#         order given by +data.sort+.
#
# Each entry is written as "{KEY, {COUNT, {VALUES}}}," where COUNT is the
# number of values. A key that is an Array of two or more code points is
# wrapped in braces; any other key is written as a single hex literal.
def print_table(table, data)
  print("static const #{table}[] = {\n")
  data.sort.each do |k, v|
    sk = if k.is_a?(Array) && k.length > 1
           "{#{hex_seq(k)}}"
         else
           # Also covers a one-element Array: String#% treats the Array as
           # the argument list, so [0x61] formats exactly like 0x61.
           "0x%04x" % k
         end
    print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
  end
  print("};\n\n")
end
+
# Reads a Unicode CaseFolding.txt file and prints the case folding and
# unfolding C tables, followed by table-size macros, to standard output.
#
# filename - path of the CaseFolding.txt file to read.
#
# Only lines of the form "<code>; <status>; <mapping>;" with status C, F or T
# are used; every other line (comments, blanks, other statuses) is skipped.
#
# * C and F lines populate +fold+ (code point => folded sequence) and
#   +unfold+ (folded sequence => source code points), the latter split into
#   three hashes by the length (1, 2 or 3) of the folded sequence.
# * T lines only mark their source code point as locale dependent; the T
#   mapping itself is not stored. After reading, the C/F entries of those
#   code points are moved out of the common tables into the *_Locale tables.
#
# Raises NoMethodError if a T line names a code point that has no C or F
# entry, because there is then no fold entry to move.
def print_case_folding_data(filename)
  # code; status; one to three space-separated folded code points;
  pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/

  fold = {}
  unfold = [{}, {}, {}]
  turkic = []

  # File.foreach rather than IO.foreach: the latter would treat a filename
  # starting with "|" as a command to execute.
  File.foreach(filename) do |line|
    next unless (res = pattern.match(line))
    ch_from = res[1].to_i(16)

    if res[2] == 'T'
      # Turkic case folding
      turkic << ch_from
      next
    end

    # store folding data: captures 3..5 hold the mapping, unused ones are nil
    ch_to = res.captures.drop(2).compact.map { |hex| hex.to_i(16) }
    fold[ch_from] = ch_to

    # store unfolding data, bucketed by the length of the folded sequence
    (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
  end

  # move locale dependent data to (un)fold_locale
  fold_locale = {}
  unfold_locale = [{}, {}]
  turkic.each do |ch_from|
    key = fold[ch_from]
    i = key.length - 1
    # One-element keys are stored as a bare Integer in the locale table.
    unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
    fold_locale[ch_from] = fold.delete(ch_from)
  end

  # print the header
  print("/* DO NOT EDIT THIS FILE. */\n")
  print("/* Generated by enc/unicode/case-folding.rb */\n\n")

  # print folding data
  print_table("CaseFold_11_Type CaseFold", fold)
  print_table("CaseFold_11_Type CaseFold_Locale", fold_locale)

  # print unfolding data
  print_table("CaseUnfold_11_Type CaseUnfold_11", unfold[0])
  print_table("CaseUnfold_11_Type CaseUnfold_11_Locale", unfold_locale[0])
  print_table("CaseUnfold_12_Type CaseUnfold_12", unfold[1])
  print_table("CaseUnfold_12_Type CaseUnfold_12_Locale", unfold_locale[1])
  print_table("CaseUnfold_13_Type CaseUnfold_13", unfold[2])

  # table sizes: entry counts scaled by a per-table factor; %d drops the
  # fractional part.
  # NOTE(review): the factors presumably leave head-room for the consumer's
  # hash tables -- confirm against the code that uses these macros.
  fold_table_size = fold.size + fold_locale.size
  printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
  unfold1_table_size = unfold[0].size + unfold_locale[0].size
  printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
  unfold2_table_size = unfold[1].size + unfold_locale[1].size
  printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
  unfold3_table_size = unfold[2].size
  printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end
+
# Script entry point: convert the file named by the first command-line
# argument, falling back to CaseFolding.txt in the current directory.
print_case_folding_data(ARGV.first || 'CaseFolding.txt')