summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog8
-rw-r--r--enc/unicode/name2ctype.h.blt8
-rw-r--r--enc/unicode/name2ctype.kwd8
-rw-r--r--enc/unicode/name2ctype.src8
-rwxr-xr-xtool/enc-unicode.rb29
5 files changed, 41 insertions, 20 deletions
diff --git a/ChangeLog b/ChangeLog
index 8a1d0b322a..8d671d556e 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,11 @@
+Fri Oct 9 02:58:18 2009 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * tool/enc-unicode.rb: optimized.
+
+ * enc/unicode/name2ctype.h, enc/unicode/name2ctype.h.blt,
+ enc/unicode/name2ctype.kwd, enc/unicode/name2ctype.src:
+ U+100000-U+10FFFD is assigned, not Cn.
+
Fri Oct 9 02:12:02 2009 Marc-Andre Lafortune <ruby-core@marc-andre.ca>
* ext/curses/curses.c: Many functions of module Curses could cause a
diff --git a/enc/unicode/name2ctype.h.blt b/enc/unicode/name2ctype.h.blt
index bbc19caf4d..9fcd60a135 100644
--- a/enc/unicode/name2ctype.h.blt
+++ b/enc/unicode/name2ctype.h.blt
@@ -3959,7 +3959,7 @@ static const OnigCodePoint CR_Any[] = {
/* 'Assigned': - */
static const OnigCodePoint CR_Assigned[] = {
- 484,
+ 485,
0x0000, 0x0377,
0x037a, 0x037e,
0x0384, 0x038a,
@@ -4444,6 +4444,7 @@ static const OnigCodePoint CR_Assigned[] = {
0xe0020, 0xe007f,
0xe0100, 0xe01ef,
0xf0000, 0xffffd,
+ 0x100000, 0x10fffd,
}; /* CR_Assigned */
/* 'C': Major Category */
@@ -4500,7 +4501,7 @@ static const OnigCodePoint CR_Cf[] = {
/* 'Cn': General Category */
static const OnigCodePoint CR_Cn[] = {
- 484,
+ 485,
0x0378, 0x0379,
0x037f, 0x0383,
0x038b, 0x038b,
@@ -4984,7 +4985,8 @@ static const OnigCodePoint CR_Cn[] = {
0xe0002, 0xe001f,
0xe0080, 0xe00ff,
0xe01f0, 0xeffff,
- 0xffffe, 0x10ffff,
+ 0xffffe, 0xfffff,
+ 0x10fffe, 0x10ffff,
}; /* CR_Cn */
/* 'Co': General Category */
diff --git a/enc/unicode/name2ctype.kwd b/enc/unicode/name2ctype.kwd
index 46058a8341..42e1244fe6 100644
--- a/enc/unicode/name2ctype.kwd
+++ b/enc/unicode/name2ctype.kwd
@@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = {
/* 'Assigned': - */
static const OnigCodePoint CR_Assigned[] = {
- 484,
+ 485,
0x0000, 0x0377,
0x037a, 0x037e,
0x0384, 0x038a,
@@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = {
0xe0020, 0xe007f,
0xe0100, 0xe01ef,
0xf0000, 0xffffd,
+ 0x100000, 0x10fffd,
}; /* CR_Assigned */
/* 'C': Major Category */
@@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = {
/* 'Cn': General Category */
static const OnigCodePoint CR_Cn[] = {
- 484,
+ 485,
0x0378, 0x0379,
0x037f, 0x0383,
0x038b, 0x038b,
@@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = {
0xe0002, 0xe001f,
0xe0080, 0xe00ff,
0xe01f0, 0xeffff,
- 0xffffe, 0x10ffff,
+ 0xffffe, 0xfffff,
+ 0x10fffe, 0x10ffff,
}; /* CR_Cn */
/* 'Co': General Category */
diff --git a/enc/unicode/name2ctype.src b/enc/unicode/name2ctype.src
index 46058a8341..42e1244fe6 100644
--- a/enc/unicode/name2ctype.src
+++ b/enc/unicode/name2ctype.src
@@ -3923,7 +3923,7 @@ static const OnigCodePoint CR_Any[] = {
/* 'Assigned': - */
static const OnigCodePoint CR_Assigned[] = {
- 484,
+ 485,
0x0000, 0x0377,
0x037a, 0x037e,
0x0384, 0x038a,
@@ -4408,6 +4408,7 @@ static const OnigCodePoint CR_Assigned[] = {
0xe0020, 0xe007f,
0xe0100, 0xe01ef,
0xf0000, 0xffffd,
+ 0x100000, 0x10fffd,
}; /* CR_Assigned */
/* 'C': Major Category */
@@ -4464,7 +4465,7 @@ static const OnigCodePoint CR_Cf[] = {
/* 'Cn': General Category */
static const OnigCodePoint CR_Cn[] = {
- 484,
+ 485,
0x0378, 0x0379,
0x037f, 0x0383,
0x038b, 0x038b,
@@ -4948,7 +4949,8 @@ static const OnigCodePoint CR_Cn[] = {
0xe0002, 0xe001f,
0xe0080, 0xe00ff,
0xe01f0, 0xeffff,
- 0xffffe, 0x10ffff,
+ 0xffffe, 0xfffff,
+ 0x10fffe, 0x10ffff,
}; /* CR_Cn */
/* 'Co': General Category */
diff --git a/tool/enc-unicode.rb b/tool/enc-unicode.rb
index 57edb3b3e5..6b14963217 100755
--- a/tool/enc-unicode.rb
+++ b/tool/enc-unicode.rb
@@ -2,6 +2,13 @@
# Creates the data structures needed by Oniguruma to map Unicode codepoints to
# property names and POSIX character classes
+#
+# To use this, get UnicodeData.txt and Scripts.txt from unicode.org
+# (http://unicode.org/Public/UNIDATA/).
+# Then run the following command:
+# ruby1.9 tool/enc-unicode.rb UnicodeData.txt Scripts.txt > enc/unicode/name2ctype.kwd
+# The output is the source file for gperf.
+# After this, simply make ruby.
unless ARGV.size == 2
$stderr.puts "Usage: #{$0} UnicodeData.txt Scripts.txt"
@@ -17,10 +24,11 @@ def pair_codepoints(codepoints)
# codepoints with property _property_. Note: It is intended that some ranges
# will begin with the value with which they end, e.g. 0x0020 -> 0x0020
- codepoints = codepoints.uniq.sort
+ codepoints.sort!
last_cp = codepoints.first
pairs = [[last_cp, nil]]
codepoints[1..-1].each do |codepoint|
+ next if last_cp == codepoint
# If the current codepoint does not follow directly on from the last
# codepoint, the last codepoint represents the end of the current range,
@@ -39,7 +47,7 @@ end
def parse_unicode_data(file)
last_cp = 0
- data = {'Cn' => []}
+ data = {'Any' => [], 'Assigned' => [], 'Cn' => []}
beg_cp = nil
IO.foreach(file) do |line|
fields = line.split(';')
@@ -64,6 +72,10 @@ def parse_unicode_data(file)
# Cn category.
data['Cn'].concat((last_cp.next...beg_cp).to_a)
+ # Assigned - Defined in unicode.c; interpreted as every character in the
+ # Unicode range minus the unassigned characters
+ data['Assigned'].concat(cps)
+
# The third field denotes the 'General' category, e.g. Lu
(data[fields[2]] ||= []).concat(cps)
@@ -73,16 +85,15 @@ def parse_unicode_data(file)
last_cp = cp
end
- # General Category property
- gcps = %w[Any Assigned]
- gcps.concat data.keys.sort
-
# The last Cn codepoint should be 0x10ffff. If it's not, append the missing
# codepoints to Cn and C
- cn_remainder = (data['Cn'].last.next..0x10ffff).to_a
+ cn_remainder = (last_cp.next..0x10ffff).to_a
data['Cn'] += cn_remainder
data['C'] += cn_remainder
+ # Define General Category properties
+ gcps = data.keys.sort
+
# We now derive the character classes (POSIX brackets), e.g. [[:alpha:]]
#
@@ -145,10 +156,6 @@ def parse_unicode_data(file)
# Any - Defined in unicode.c
data['Any'] = (0x0000..0x10ffff).to_a
- # Assigned - Defined in unicode.c; interpreted as every character in the
- # Unicode range minus the unassigned characters
- data['Assigned'] = data['Any'] - data['Cn']
-
# Returns General Category Property names and the data
[gcps, data]
end