summaryrefslogtreecommitdiff
path: root/template/unicode_norm_gen.tmpl
diff options
context:
space:
mode:
author nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2014-10-25 07:20:15 +0000
committer nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2014-10-25 07:20:15 +0000
commit 9b581e0d0b41dccc8c15400f05ca5c763c6c41b9 (patch)
tree a1f22b735e7cf00ff41d3acf463e66513e749dd2 /template/unicode_norm_gen.tmpl
parent 67a19e7a59dccbc00daed2970350a20124926afb (diff)
template/unicode_norm_gen.tmpl: from tool/unicode_norm_gen.rb
* template/unicode_norm_gen.tmpl: use generic_erb.rb to update if changed and manage timestamp, so that source tree on read-only filesystem works. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@48129 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'template/unicode_norm_gen.tmpl')
-rw-r--r-- template/unicode_norm_gen.tmpl 211
1 files changed, 211 insertions, 0 deletions
diff --git a/template/unicode_norm_gen.tmpl b/template/unicode_norm_gen.tmpl
new file mode 100644
index 0000000..332cb15
--- /dev/null
+++ b/template/unicode_norm_gen.tmpl
@@ -0,0 +1,211 @@
+%# -*- mode: ruby; coding: utf-8 -*-
+<%
+# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+# Script to generate Ruby data structures used in implementing
+# String#unicode_normalize,...
+
+# Constants for input and output directory
+InputDataDir = ARGV[0] || 'enc/unicode/data' # first command-line argument, else this default
+OuputDataDir = ARGV[1] || 'lib/unicode_normalize' # second command-line argument, else this default; name is spelled as in the original and is not referenced in the visible part of this template
+
+# convenience methods
+class Integer
+ def to_UTF8() # convert this codepoint to text for a double-quoted string literal, taking legibility into account
+ if self>0xFFFF # above U+FFFF: backslash-u followed by the uppercase hex digits in braces
+ "\\u{#{to_s(16).upcase}}"
+ elsif self>0x7f # 0x80..0xFFFF: backslash-u followed by four uppercase hex digits, zero-padded on the left
+ "\\u#{to_s(16).upcase.rjust(4, '0')}"
+ else # 0x7f and below: the character itself, with a backslash put in front of a backslash or a double quote
+ chr.sub(/[\\\"]/, "\\\\\\\&")
+ end
+ end
+end
+
+module Enumerable
+ unless method_defined?(:each_slice) # fallback: only defined when each_slice is not already available
+ def each_slice(n) # yields successive arrays of n elements to the block; returns self
+ ary = []
+ each do |i|
+ ary << i
+ if ary.size >= n # slice is full: hand it to the block and start a fresh one
+ yield ary
+ ary = []
+ end
+ end
+ yield ary unless ary.empty? # leftover elements form a final, shorter slice
+ self
+ end
+ end
+end
+
+class Array
+ def to_UTF8() collect {|c| c.to_UTF8}.join('') end # concatenation of every element's to_UTF8
+
+ def each_regexp_chars(n = 8) # converts an array of Integers to character ranges, yielded as strings of up to n ranges each
+ sort.inject([]) do |ranges, value|
+ if ranges.last and ranges.last[1]+1>=value # value repeats or directly follows the end of the current range: extend that range
+ ranges.last[1] = value
+ ranges
+ else # first value, or a gap: open a new [first, last] pair
+ ranges << [value, value]
+ end
+ end.collect do |first, last|
+ case last-first
+ when 0 # single codepoint
+ first.to_UTF8
+ when 1 # two adjacent codepoints: written one after the other, without a hyphen
+ first.to_UTF8 + last.to_UTF8
+ else # three or more codepoints: written as first-last
+ first.to_UTF8 + '-' + last.to_UTF8
+ end
+ end.each_slice(n) do |slice|
+ yield slice.join('') # one string per group of up to n rendered ranges
+ end
+ end
+end
+
+# read the file 'CompositionExclusions.txt'; vpath is not defined in this template (presumably provided by the script that renders it -- TODO confirm)
+composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
+ f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex} # keep lines that start with 4-5 uppercase letters or digits; String#hex converts the leading hex digits to an Integer
+}
+
+decomposition_table = {} # codepoint => canonical decomposition, as an array of codepoints
+kompatible_table = {} # codepoint => kompatibility decomposition, as an array of codepoints (leading tag removed)
+combining_class = {} # codepoint => combining class as Integer; codepoints whose class field is "0" are not stored
+
+# read the file 'UnicodeData.txt'
+vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
+ codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";") # semicolon-separated fields; only fields 0, 1, 3 and 5 are used below
+
+ case decomposition
+ when /^[0-9A-F]/ # field starts with a hex digit: a plain list of codepoints
+ decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
+ when /^</ # field starts with a <tag>: drop the first word (the tag) and keep the codepoints
+ kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
+ end
+ combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
+
+ if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
+ warn "Unexpected: Character range with data relevant to normalization!" # entries named ...First> / ...Last> are only checked here, not expanded into their ranges
+ end
+end
+
+# calculate compositions from decompositions
+composition_table = decomposition_table.reject do |character, decomposition|
+ composition_exclusions.member? character or # predefined composition exclusion
+ decomposition.length<=1 or # Singleton Decomposition
+ combining_class[character] or # character is not a Starter
+ combining_class[decomposition.first] # decomposition begins with a character that is not a Starter
+end.invert # after invert: decomposition (array of codepoints) => composed codepoint
+
+# recalculate composition_exclusions
+composition_exclusions = decomposition_table.keys - composition_table.values # every codepoint with a decomposition that was rejected above
+
+accent_array = combining_class.keys + composition_table.keys.collect {|key| key.last} # codepoints with a stored combining class, plus the last element of each composable decomposition
+
+composition_starters = composition_table.keys.collect {|key| key.first} # first element of each composable decomposition
+
+hangul_no_trailing = []
+0xAC00.step(0xD7A3, 28) {|c| hangul_no_trailing << c} # every 28th codepoint starting at U+AC00, not going past U+D7A3
+
+# expand decomposition table values until no element has a decomposition of its own
+decomposition_table.each do |key, value|
+ position = 0
+ while position < value.length
+ if decomposition = decomposition_table[value[position]] # this element is itself decomposable
+ decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
+ value[position, 1] = decomposition # splice in place; position is not advanced, so the spliced-in elements are examined again
+ else
+ position += 1
+ end
+ end
+end
+
+# deal with relationship between canonical and kompatibility decompositions
+decomposition_table.each do |key, value|
+ value = value.dup # work on a copy; the array stored in decomposition_table is not modified
+ expanded = false
+ position = 0
+ while position < value.length
+ if decomposition = kompatible_table[value[position]]
+ value[position, 1] = decomposition # splice in the kompatibility decomposition; the same position is examined again
+ expanded = true
+ else
+ position += 1
+ end
+ end
+ kompatible_table[key] = value if expanded # entry is written only when at least one element was replaced
+end
+
+# generate normalization tables file
+%># coding: us-ascii
+%# >
+
+# automatically generated by tool/unicode_norm_gen.rb
+
+module UnicodeNormalize
+ accents = "" \
+ "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>]" \
+ "".freeze
+ ACCENTS = accents
+ REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
+ }" \
+ "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>]#{accents}*" \
+ "|#{'' # characters that can be the result of a composition, except composition starters
+ }" \
+ "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>]?#{accents}+" \
+ "|#{'' # precomposed Hangul syllables
+ }" \
+ "[\u{AC00}-\u{D7A4}]" \
+ "".freeze
+ REGEXP_C_STRING = "#{'' # composition exclusions
+ }" \
+ "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>]#{accents}*" \
+ "|#{'' # composition starters and characters that can be the result of a composition
+ }" \
+ "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>]?#{accents}+" \
+ "|#{'' # Hangul syllables with separate trailer
+ }" \
+ "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
+ "<% end%>][\u11A8-\u11C2]" \
+ "|#{'' # decomposed Hangul syllables
+ }" \
+ "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
+ "".freeze
+ REGEXP_K_STRING = "" \
+ "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
+ "<%end%>]" \
+ "".freeze
+
+ class_table = {
+% combining_class.each_slice(8) do |slice| # up to eight "codepoint"=>class entries per generated line
+ <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
+% end
+ }
+ class_table.default = 0
+ CLASS_TABLE = class_table.freeze
+
+ DECOMPOSITION_TABLE = {
+% decomposition_table.each_slice(8) do |slice| # up to eight entries per generated line; key is one codepoint, value is its array of codepoints
+ <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+ }.freeze
+
+ KOMPATIBLE_TABLE = {
+% kompatible_table.each_slice(8) do |slice| # up to eight entries per generated line; key is one codepoint, value is its array of codepoints
+ <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+ }.freeze
+
+ COMPOSITION_TABLE = {
+% composition_table.each_slice(8) do |slice| # up to eight entries per generated line; key is an array of codepoints, value is the single composed codepoint
+ <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
+% end
+ }.freeze
+end