summaryrefslogtreecommitdiff
path: root/enc/unicode/case-folding.rb
blob: e39bef20f8dea38b2c91fb30fa9b57c1b00b7f3d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/ruby

# Usage:
#   $ wget http://www.unicode.org/Public/UNIDATA/CaseFolding.txt
#   $ ruby case-folding.rb CaseFolding.txt > casefold.h


# Renders the elements of +v+ as C hexadecimal literals (zero-padded to at
# least four digits) and returns them joined by ", " as one String.
# An empty +v+ yields an empty String.
def hex_seq(v)
  literals = v.map do |code_point|
    "0x%04x" % code_point
  end
  literals.join(", ")
end

# Prints one C array definition to standard output.
#
# table:: text placed between "static const " and "[]" in the declaration.
# data::  collection of key/value pairs; its sorted entries become the
#         initializer rows. Each value must respond to +length+ and be
#         acceptable to hex_seq.
#
# A key that is an Array with more than one element is written as a braced
# hex list; any other key is written as a single hex literal.
def print_table(table, data)
  print("static const #{table}[] = {\n")
  data.sort.each do |key, values|
    multi_code_key = key.is_a?(Array) && key.length > 1
    key_text = multi_code_key ? "{#{hex_seq(key)}}" : ("0x%04x" % key)
    print("  {#{key_text}, {#{values.length}, {#{hex_seq(values)}}}},\n")
  end
  print("};\n\n")
end

# Reads a CaseFolding.txt-style data file and prints, on standard output, the
# C source for the case folding / unfolding tables followed by the table size
# macros.
#
# filename:: path of the data file to read.
#
# Lines that do not match the pattern below are ignored. Matching lines carry
# a source code point, a status letter (C, F or T) and one to three target
# code points. The return value is not meaningful; the result is the printed
# text.
def print_case_folding_data(filename)
  pattern = /([0-9A-F]{4,6}); ([CFT]); ([0-9A-F]{4,6})(?: ([0-9A-F]{4,6}))?(?: ([0-9A-F]{4,6}))?;/

  fold = {}              # source code point => array of target code points
  unfold = [{}, {}, {}]  # unfold[n - 1]: n-element target array => source code points
  turkic = []            # source code points seen on T lines

  # File.foreach always opens a file; IO.foreach would run a command if the
  # name began with "|" on Ruby versions that still support that form.
  File.foreach(filename) do |line|
    res = pattern.match(line)
    next unless res
    ch_from = res[1].to_i(16)

    if res[2] == 'T'
      # Turkic case folding: only remember the code point; its C/F entry is
      # moved to the locale tables after the whole file has been read.
      turkic << ch_from
      next
    end

    # store folding data (the pattern has exactly three target groups: 3..5)
    ch_to = res.values_at(3, 4, 5).compact.map { |hex| hex.to_i(16) }
    fold[ch_from] = ch_to

    # store unfolding data, bucketed by the length of the target sequence
    (unfold[ch_to.length - 1][ch_to] ||= []) << ch_from
  end

  # move locale dependent data to (un)fold_locale
  fold_locale = {}
  unfold_locale = [{}, {}]
  turkic.each do |ch_from|
    key = fold[ch_from]
    # A code point with a T line but no C/F entry (or a repeated T line whose
    # entry was already moved) has nothing left to relocate.
    next unless key
    i = key.length - 1
    # single-code-point targets are keyed by the bare integer, not the array
    unfold_locale[i][i == 0 ? key[0] : key] = unfold[i].delete(key)
    fold_locale[ch_from] = fold.delete(ch_from)
  end

  # print the header
  print("/* DO NOT EDIT THIS FILE. */\n")
  print("/* Generated by enc/unicode/case-folding.rb */\n\n")

  # print folding data
  print_table("CaseFold_11_Type CaseFold", fold)
  print_table("CaseFold_11_Type CaseFold_Locale", fold_locale)

  # print unfolding data, each table followed by its locale counterpart
  print_table("CaseUnfold_11_Type CaseUnfold_11", unfold[0])
  print_table("CaseUnfold_11_Type CaseUnfold_11_Locale", unfold_locale[0])
  print_table("CaseUnfold_12_Type CaseUnfold_12", unfold[1])
  print_table("CaseUnfold_12_Type CaseUnfold_12_Locale", unfold_locale[1])
  print_table("CaseUnfold_13_Type CaseUnfold_13", unfold[2])

  # table sizes: entry counts scaled by a fixed factor and truncated by %d
  # NOTE(review): the factors presumably give headroom to whatever consumes
  # these macros -- confirm against the C side.
  fold_table_size = fold.size + fold_locale.size
  printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
  unfold1_table_size = unfold[0].size + unfold_locale[0].size
  printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
  unfold2_table_size = unfold[1].size + unfold_locale[1].size
  printf("#define UNFOLD2_TABLE_SIZE\t%d\n", (unfold2_table_size * 1.5))
  unfold3_table_size = unfold[2].size
  printf("#define UNFOLD3_TABLE_SIZE\t%d\n", (unfold3_table_size * 1.7))
end

# Script entry point: use the first command-line argument as the data file,
# falling back to 'CaseFolding.txt' when no argument is given.
print_case_folding_data(ARGV.first || 'CaseFolding.txt')