summaryrefslogtreecommitdiff
path: root/enc/unicode
diff options
context:
space:
mode:
Diffstat (limited to 'enc/unicode')
-rwxr-xr-xenc/unicode/case-folding.rb28
-rw-r--r--enc/unicode/casefold.h137
2 files changed, 154 insertions, 11 deletions
diff --git a/enc/unicode/case-folding.rb b/enc/unicode/case-folding.rb
index a877100709..682e86662d 100755
--- a/enc/unicode/case-folding.rb
+++ b/enc/unicode/case-folding.rb
@@ -78,12 +78,17 @@ class CaseFolding
self
end
+ def range_check(code)
+ "#{code} <= MAX_CODE_VALUE && #{code} >= MIN_CODE_VALUE"
+ end
+
def lookup_hash(key, type, data)
hash = "onigenc_unicode_#{key}_hash"
lookup = "onigenc_unicode_#{key}_lookup"
- gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
- argname = "code"
- argdecl = "const OnigCodePoint #{argname}"
+ arity = Array(data[0][0]).size
+ gperf = %W"gperf -7 -k#{[*1..(arity*3)].join(",")} -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
+ argname = arity > 1 ? "codes" : "code"
+ argdecl = "const OnigCodePoint #{arity > 1 ? "*": ""}#{argname}"
n = 7
m = (1 << n) - 1
min, max = data.map {|c, *|c}.flatten.minmax
@@ -101,17 +106,21 @@ class CaseFolding
src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
- body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
+ body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_#{arity > 1 ? 'at' : 'of'}(#{argname}, \\1)")
"#{name}(#{argdecl})\n{\n#{body}}"
}
src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
- body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
+ body.gsub!(/(#{hash})\s*\(.*?\)/, "\\1(#{argname})")
body.gsub!(/\{"",-1}/, "-1")
body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
- body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
+ body.sub!(/(\s+if\s)\(len\b.*\)/) do
+ "#$1(" <<
+ (arity > 1 ? (0...arity).map {|i| range_check("#{argname}[#{i}]")}.join(" &&\n ") : range_check(argname)) <<
+ ")"
+ end
v = nil
body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
pre = $1
@@ -119,7 +128,7 @@ class CaseFolding
s = $3
s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
v = $1
- s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
+ s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code#{arity}_equal(#{argname}, #{key}_Table[#{v}].from))")
"#{pre}{#{s}\n#{indent}}"
}
body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
@@ -149,15 +158,14 @@ class CaseFolding
# CaseUnfold_12 + CaseUnfold_12_Locale
name = "CaseUnfold_12"
- print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
+ data = print_table(dest, name, name=>unfold[1], "#{name}_Locale"=>unfold_locale[1])
+ dest.print lookup_hash(name, "CodePointList2", data)
# CaseUnfold_13
name = "CaseUnfold_13"
print_table(dest, name, name=>unfold[2])
# table sizes
- unfold2_table_size = unfold[1].size + unfold_locale[1].size
- dest.printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
unfold3_table_size = unfold[2].size
dest.printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end
diff --git a/enc/unicode/casefold.h b/enc/unicode/casefold.h
index 8647e5e8a6..02d742a76d 100644
--- a/enc/unicode/casefold.h
+++ b/enc/unicode/casefold.h
@@ -5022,6 +5022,142 @@ static const CaseUnfold_12_Type CaseUnfold_12_Table[] = {
{{0x0069, 0x0307}, {1, {0x0130}}},
};
+/* C code produced by gperf version 3.0.4 */
+/* Command-line: gperf -7 -k1,2,3,4,5,6 -F,-1 -c -j1 -i1 -t -T -E -C -H onigenc_unicode_CaseUnfold_12_hash -N onigenc_unicode_CaseUnfold_12_lookup */
+
+/* maximum key range = 71, duplicates = 0 */
+
+#if (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || defined(__cplusplus) || defined(__GNUC_STDC_INLINE__)
+inline
+#elif defined(__GNUC__)
+__inline
+#endif
+/*ARGSUSED*/
+static unsigned int
+onigenc_unicode_CaseUnfold_12_hash(const OnigCodePoint *codes)
+{
+ static const unsigned char asso_values[] =
+ {
+ 3, 58, 54, 57, 56, 16, 8, 2, 43, 82,
+ 3, 1, 23, 82, 82, 82, 82, 82, 82, 4,
+ 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
+ 82, 82, 52, 51, 50, 49, 48, 47, 46, 45,
+ 82, 82, 82, 82, 43, 82, 42, 82, 82, 13,
+ 82, 82, 82, 82, 82, 11, 82, 1, 82, 82,
+ 14, 82, 1, 82, 82, 31, 3, 82, 82, 30,
+ 82, 82, 82, 10, 82, 82, 82, 82, 37, 82,
+ 82, 82, 82, 82, 82, 82, 82, 82, 82, 82,
+ 82, 82, 82, 82, 82, 82, 37, 15, 36, 35,
+ 34, 17, 1, 33, 12, 4, 23, 23, 26, 21,
+ 13, 82, 27, 82, 82, 2, 5, 82, 11, 16,
+ 82, 15, 82, 82, 23, 82, 8, 82
+ };
+ return asso_values[bits_at(codes, 5)] + asso_values[bits_at(codes, 4)] + asso_values[bits_at(codes, 3)] + asso_values[bits_at(codes, 2)] + asso_values[bits_at(codes, 1)] + asso_values[bits_at(codes, 0)];
+}
+
+#ifdef __GNUC__
+__inline
+#if defined __GNUC_STDC_INLINE__ || defined __GNUC_GNU_INLINE__
+__attribute__ ((__gnu_inline__))
+#endif
+#endif
+static const CodePointList2 *
+onigenc_unicode_CaseUnfold_12_lookup(const OnigCodePoint *codes)
+{
+ enum
+ {
+ MIN_CODE_VALUE = 0x61,
+ MAX_CODE_VALUE = 0x1f7c,
+ TOTAL_KEYWORDS = 59,
+ MIN_WORD_LENGTH = 6,
+ MAX_WORD_LENGTH = 6,
+ MIN_HASH_VALUE = 11,
+ MAX_HASH_VALUE = 81
+ };
+
+ static const short wordlist[] =
+ {
+ -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1,
+ /*0x1f66,0x03b9*/ 53,
+ /*0x1f07,0x03b9*/ 38,
+ /*0x1f00,0x03b9*/ 31,
+ /*0x0066,0x0066*/ 1,
+ /*0x1f74,0x03b9*/ 56,
+ /*0x0073,0x0073*/ 6,
+ /*0x0066,0x0069*/ 2,
+ /*0x1f06,0x03b9*/ 37,
+ /*0x0073,0x0074*/ 7,
+ /*0x03b9,0x0342*/ 18,
+ /*0x03c9,0x03b9*/ 23,
+ /*0x03b7,0x03b9*/ 17,
+ /*0x0069,0x0307*/ 58,
+ /*0x03b1,0x03b9*/ 15,
+ /*0x1f61,0x03b9*/ 48,
+ /*0x1f05,0x03b9*/ 36,
+ /*0x1f65,0x03b9*/ 52,
+ /*0x0574,0x0576*/ 29,
+ /*0x03c9,0x0342*/ 22,
+ /*0x03b7,0x0342*/ 16,
+ /*0x057e,0x0576*/ 30,
+ /*0x03b1,0x0342*/ 14,
+ /*0x1f7c,0x03b9*/ 57,
+ /*0x0574,0x0565*/ 26,
+ /*0x0079,0x030a*/ 10,
+ /*0x0077,0x030a*/ 9,
+ /*0x1f70,0x03b9*/ 55,
+ /*0x0574,0x056d*/ 28,
+ /*0x0066,0x006c*/ 3,
+ /*0x0574,0x056b*/ 27,
+ /*0x0061,0x02be*/ 0,
+ /*0x0068,0x0331*/ 4,
+ /*0x1f67,0x03b9*/ 54,
+ /*0x1f64,0x03b9*/ 51,
+ /*0x1f63,0x03b9*/ 50,
+ /*0x1f62,0x03b9*/ 49,
+ /*0x1f60,0x03b9*/ 47,
+ /*0x03ce,0x03b9*/ 24,
+ /*0x03c5,0x0342*/ 21,
+ /*0x03c5,0x0313*/ 20,
+ /*0x03c1,0x0313*/ 19,
+ /*0x02bc,0x006e*/ 11,
+ /*0x03ae,0x03b9*/ 13,
+ /*0x03ac,0x03b9*/ 12,
+ /*0x1f27,0x03b9*/ 46,
+ /*0x1f26,0x03b9*/ 45,
+ /*0x1f25,0x03b9*/ 44,
+ /*0x1f24,0x03b9*/ 43,
+ /*0x1f23,0x03b9*/ 42,
+ /*0x1f22,0x03b9*/ 41,
+ /*0x1f21,0x03b9*/ 40,
+ /*0x1f20,0x03b9*/ 39,
+ /*0x006a,0x030c*/ 5,
+ /*0x1f02,0x03b9*/ 33,
+ /*0x0074,0x0308*/ 8,
+ /*0x1f04,0x03b9*/ 35,
+ /*0x1f03,0x03b9*/ 34,
+ /*0x1f01,0x03b9*/ 32,
+ -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1,
+ /*0x0565,0x0582*/ 25
+ };
+
+ if (codes[0] <= MAX_CODE_VALUE && codes[0] >= MIN_CODE_VALUE &&
+ codes[1] <= MAX_CODE_VALUE && codes[1] >= MIN_CODE_VALUE)
+ {
+ register int key = onigenc_unicode_CaseUnfold_12_hash(codes);
+
+ if (key <= MAX_HASH_VALUE && key >= 0)
+ {
+ register short s = wordlist[key];
+
+ if (s >= 0 && code2_equal(codes, CaseUnfold_12_Table[s].from))
+ return &CaseUnfold_12_Table[s].to;
+ }
+ }
+ return 0;
+}
+
static const CaseUnfold_13_Type CaseUnfold_13_Table[] = {
#define CaseUnfold_13 (*(CaseUnfold_13_Type (*)[14])(CaseUnfold_13_Table+0))
{{0x0066, 0x0066, 0x0069}, {1, {0xfb03}}},
@@ -5040,5 +5176,4 @@ static const CaseUnfold_13_Type CaseUnfold_13_Table[] = {
{{0x03c9, 0x0342, 0x03b9}, {1, {0x1ff7}}},
};
-#define UNFOLD2_TABLE_SIZE 88
#define UNFOLD3_TABLE_SIZE 23