summaryrefslogtreecommitdiff
path: root/tool/enc-unicode.rb
diff options
context:
space:
mode:
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-10-08 02:49:11 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2009-10-08 02:49:11 +0000
commit: 866c79e2de4567d71f432652c58b48fe50916f37 (patch)
tree: 799d6e168abeaa5babf61a3e42ab5f0ab9b2094c /tool/enc-unicode.rb
parent: ec0e370eb5451a1e597bf528f8f9a2dcc46880f0 (diff)
* tool/enc-unicode.rb: parse range notation of UnicodeData.txt.
* enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt, enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src: follow above change. [ruby-dev:39444] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@25260 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'tool/enc-unicode.rb')
-rwxr-xr-x tool/enc-unicode.rb 20
1 files changed, 15 insertions, 5 deletions
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
index 8429bcc..57edb3b 100755
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@@ -40,26 +40,36 @@ end
def parse_unicode_data(file)
last_cp = 0
data = {'Cn' => []}
+ beg_cp = nil
IO.foreach(file) do |line|
fields = line.split(';')
cp = fields[0].to_i(16)
+ case fields[1]
+ when /\A<(.*),\s*First>\z/
+ beg_cp = cp
+ next
+ when /\A<(.*),\s*Last>\z/
+ cps = (beg_cp..cp).to_a
+ else
+ beg_cp = cp
+ cps = [cp]
+ end
+
# The Cn category represents unassigned characters. These are not listed in
# UnicodeData.txt so we must derive them by looking for 'holes' in the range
# of listed codepoints. We increment the last codepoint seen and compare it
# with the current codepoint. If the current codepoint is greater than
# last_cp.next we have found a hole, so we add the missing codepoints to the
# Cn category.
- while ((last_cp = last_cp.next) < cp)
- data['Cn'] << last_cp
- end
+ data['Cn'].concat((last_cp.next...beg_cp).to_a)
# The third field denotes the 'General' category, e.g. Lu
- (data[fields[2]] ||= []) << cp
+ (data[fields[2]] ||= []).concat(cps)
# The 'Major' category is the first letter of the 'General' category, e.g.
# 'Lu' -> 'L'
- (data[fields[2][0,1]] ||= []) << cp
+ (data[fields[2][0,1]] ||= []).concat(cps)
last_cp = cp
end