summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r--  ChangeLog                    18
-rw-r--r--  enc/trans/iso2022.erb.c      10
-rw-r--r--  enc/trans/utf_16_32.erb.c    41
-rw-r--r--  test/ruby/test_transcode.rb  24
-rw-r--r--  tool/transcode-tblgen.rb     53
-rw-r--r--  transcode.c                  29
-rw-r--r--  transcode_data.h              2
7 files changed, 109 insertions, 68 deletions
diff --git a/ChangeLog b/ChangeLog
index 2781b23428..2c0fe8041d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+Sat Aug 9 00:42:33 2008 Tanaka Akira <akr@fsij.org>
+
+ * transcode_data.h (rb_transcoder): from_unit_length field added.
+ from_utf8 field removed.
+
+ * tool/transcode-tblgen.rb: generate offsets range.
+ follow rb_transcoder change.
+
+ * transcode.c (transcode_loop): don't use from_utf8.
+ make invalid region from_unit_length wise.
+
+ * enc/trans/iso2022.erb.c: follow rb_transcoder and
+ transcode_generate_node change.
+
+ * enc/trans/utf_16_32.erb.c: follow rb_transcoder and
+ transcode_generate_node change.
+ explicit :invalid map removed.
+
Fri Aug 8 23:29:44 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* enc/depend (TRANSCSRCS): needs rule_subst to apply.
diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c
index c3f6be693c..72553f4054 100644
--- a/enc/trans/iso2022.erb.c
+++ b/enc/trans/iso2022.erb.c
@@ -12,8 +12,8 @@
map_jisx0208_rest["{21-7e}"] = :func_so
%>
-<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %>
-<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %>
+<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp") %>
+<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest") %>
static VALUE
fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l)
@@ -57,7 +57,7 @@ fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, u
static const rb_transcoder
rb_ISO_2022_JP_to_EUC_JP = {
- "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0,
+ "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3,
NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
};
@@ -71,7 +71,7 @@ rb_ISO_2022_JP_to_EUC_JP = {
}
%>
-<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %>
+<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp") %>
static int
fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o)
@@ -129,7 +129,7 @@ finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
static const rb_transcoder
rb_EUC_JP_to_ISO_2022_JP = {
- "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0,
+ "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5,
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
};
diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c
index 67f84e74bf..2cf7560b4e 100644
--- a/enc/trans/utf_16_32.erb.c
+++ b/enc/trans/utf_16_32.erb.c
@@ -183,14 +183,12 @@ fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned
map = {}
map["{00-d7,e0-ff}{00-ff}"] = :func_so
map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so
- map["{dc-df}{00-ff}"] = :invalid
- map["{d8-db}{00-ff}{00-db,e0-ff}{00-ff}"] = :invalid
- transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE", [])
+ transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE")
%>
static const rb_transcoder
rb_from_UTF_16BE = {
- "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
+ "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4,
NULL, NULL, NULL, &fun_so_from_utf_16be
};
@@ -205,18 +203,13 @@ rb_from_UTF_16BE = {
map["f0{90-bf}{80-bf}{80-bf}"] = :func_so
map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so
map["f4{80-8f}{80-bf}{80-bf}"] = :func_so
- map["{80-c1,f5-ff}"] = :invalid
- map["e0{80-9f}"] = :invalid
- map["ed{a0-bf}"] = :invalid
- map["f0{80-8f}"] = :invalid
- map["f4{90-bf}"] = :invalid
am = ActionMap.parse(map)
- transcode_generate_node(am, "to_UTF_16BE", [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf])
+ transcode_generate_node(am, "to_UTF_16BE")
%>
static const rb_transcoder
rb_to_UTF_16BE = {
- "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
+ "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_16be
};
@@ -224,20 +217,18 @@ rb_to_UTF_16BE = {
map = {}
map["{00-ff}{00-d7,e0-ff}"] = :func_so
map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so
- map["{00-ff}{dc-df}"] = :invalid
- map["{00-ff}{d8-db}{00-ff}{00-db,e0-ff}"] = :invalid
- transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE", [])
+ transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE")
%>
static const rb_transcoder
rb_from_UTF_16LE = {
- "UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
+ "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4,
NULL, NULL, NULL, &fun_so_from_utf_16le
};
static const rb_transcoder
rb_to_UTF_16LE = {
- "UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
+ "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_16le
};
@@ -245,21 +236,18 @@ rb_to_UTF_16LE = {
map = {}
map["0000{00-d7,e0-ff}{00-ff}"] = :func_so
map["00{01-10}{00-ff}{00-ff}"] = :func_so
- map["00{11-ff}{00-ff}{00-ff}"] = :invalid
- map["0000{d8-df}{00-ff}"] = :invalid
- map["{01-ff}{00-ff}{00-ff}{00-ff}"] = :invalid
- transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE", [])
+ transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE")
%>
static const rb_transcoder
rb_from_UTF_32BE = {
- "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
+ "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4,
NULL, NULL, NULL, &fun_so_from_utf_32be
};
static const rb_transcoder
rb_to_UTF_32BE = {
- "UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
+ "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_32be
};
@@ -267,21 +255,18 @@ rb_to_UTF_32BE = {
map = {}
map["{00-ff}{00-d7,e0-ff}0000"] = :func_so
map["{00-ff}{00-ff}{01-10}00"] = :func_so
- map["{00-ff}{00-ff}{00-ff}{01-ff}"] = :invalid
- map["{00-ff}{00-ff}{11-ff}00"] = :invalid
- map["{00-ff}{d8-df}0000"] = :invalid
- transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE", [])
+ transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE")
%>
static const rb_transcoder
rb_from_UTF_32LE = {
- "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
+ "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4,
NULL, NULL, NULL, &fun_so_from_utf_32le
};
static const rb_transcoder
rb_to_UTF_32LE = {
- "UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
+ "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_32le
};
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb
index 26e56ffb0c..095028e42a 100644
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@@ -267,8 +267,30 @@ class TestTranscode < Test::Unit::TestCase
"\x80".encode("UTF-32BE", "UTF-8", invalid: :replace))
assert_equal("\xFD\xFF\x00\x00".force_encoding("UTF-32LE"),
"\x80".encode("UTF-32LE", "UTF-8", invalid: :replace))
+
+ assert_equal("\uFFFD!",
+ "\xdc\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
+ assert_equal("\uFFFD!",
+ "\xd8\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
+
+ assert_equal("\uFFFD!",
+ "\x00\xdc!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
+ assert_equal("\uFFFD!",
+ "\x00\xd8!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
+
+ assert_equal("\uFFFD!",
+ "\x01\x00\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]")
+ assert_equal("\uFFFD!",
+ "\x00\xff\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
+ assert_equal("\uFFFD!",
+ "\x00\x00\xd8\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
+
+ assert_equal("\uFFFD!",
+ "\xff!".encode("utf-8", "euc-jp", :invalid=>:replace))
+ assert_equal("\uFFFD!",
+ "\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
assert_equal("\uFFFD!",
- "\x01\x00\x00\x00\x00\x00\x00\x21".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]")
+ "\x8f\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
end
def test_undef_replace
diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb
index 767ea0bbf3..3a20b3f0b1 100644
--- a/tool/transcode-tblgen.rb
+++ b/tool/transcode-tblgen.rb
@@ -213,13 +213,16 @@ class ActionMap
OffsetsMemo = {}
InfosMemo = {}
- def format_offsets(offsets)
- code = "{\n"
+ def format_offsets(min, max, offsets)
+ offsets = offsets[min..max]
+ code = "{ %d, %d,\n" % [min, max]
0.step(offsets.length-1,16) {|i|
code << " "
code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
- code << " "
- code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+ if i+8 < offsets.length
+ code << " "
+ code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
+ end
code << "\n"
}
code << '}'
@@ -276,14 +279,22 @@ class ActionMap
offsets = []
infos = []
infomap = {}
+ min = max = nil
table.each_with_index {|action, byte|
action ||= :invalid
+ if action != :invalid
+ min = byte if !min
+ max = byte
+ end
unless o = infomap[action]
infomap[action] = o = infos.length
infos[o] = action
end
offsets[byte] = o
}
+ if !min
+ min = max = 0
+ end
if n = OffsetsMemo[offsets]
offsets_name = n
@@ -292,7 +303,7 @@ class ActionMap
offsets_name = "#{name}_offsets"
offsets_code = <<"End"
static const unsigned char
-#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)};
+#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
End
OffsetsMemo[offsets] = offsets_name
end
@@ -324,24 +335,19 @@ End
PostMemo = {}
NextName = "a"
- def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil)
- ranges = [0x00..0xff] if ranges.empty?
- range = ranges.first
+ def generate_node(code, name_hint=nil, valid_encoding=nil)
if n = PreMemo[[self,valid_encoding]]
return n
end
- table = Array.new(range.end - range.begin + 1)
+ table = Array.new(0x100, :invalid)
each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
- unless range === byte
- raise "byte not in range"
- end
if a = rest.empty_action
- table[byte-range.begin] = a
+ table[byte] = a
else
name_hint2 = nil
name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
- table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding)
+ table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
end
}
@@ -386,9 +392,8 @@ def transcode_compile_tree(name, from, map)
valid_encoding = nil
end
- ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
code = ''
- defined_name = am.generate_node(code, name, ranges, valid_encoding)
+ defined_name = am.generate_node(code, name, valid_encoding)
return defined_name, code
end
@@ -409,22 +414,22 @@ def transcode_tblgen(from, to, map)
real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)
transcoder_name = "rb_#{tree_name}"
TRANSCODERS << transcoder_name
- from_utf8 = from == 'UTF-8' ? 1 : 0
+ from_unit_length = UnitLength[from]
max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
- #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{max_output}, #{from_utf8},
+ #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output},
NULL, NULL,
};
End
tree_code + "\n" + transcoder_code
end
-def transcode_generate_node(am, name_hint=nil, ranges=[])
+def transcode_generate_node(am, name_hint=nil)
STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
code = ''
- am.generate_node(code, name_hint, ranges)
+ am.generate_node(code, name_hint)
code
end
@@ -436,6 +441,14 @@ def transcode_register_code
code
end
+UnitLength = {
+ 'UTF-16BE' => 2,
+ 'UTF-16LE' => 2,
+ 'UTF-32BE' => 4,
+ 'UTF-32LE' => 4,
+}
+UnitLength.default = 1
+
ValidEncoding = {
'1byte' => '{00-ff}',
'2byte' => '{00-ff}{00-ff}',
diff --git a/transcode.c b/transcode.c
index 3a1ab70a81..75a802572c 100644
--- a/transcode.c
+++ b/transcode.c
@@ -336,10 +336,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start;
const BYTE_LOOKUP *next_table;
const unsigned char *char_start;
- unsigned int next_offset;
VALUE next_info;
unsigned char next_byte;
- int from_utf8 = my_transcoder->from_utf8;
unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
@@ -355,8 +353,12 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
}
next_byte = (unsigned char)*in_p++;
follow_byte:
- next_offset = next_table->base[next_byte];
- next_info = (VALUE)next_table->info[next_offset];
+ if (next_byte < next_table->base[0] || next_table->base[1] < next_byte)
+ next_info = INVALID;
+ else {
+ unsigned int next_offset = next_table->base[2+next_byte-next_table->base[0]];
+ next_info = (VALUE)next_table->info[next_offset];
+ }
follow_info:
switch (next_info & 0x1F) {
case NOMAP:
@@ -370,14 +372,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
goto invalid;
}
next_byte = (unsigned char)*in_p++;
- if (from_utf8) {
- if ((next_byte&0xC0) == 0x80)
- next_byte -= 0x80;
- else {
- in_p--; /* may need to add more code later to revert other things */
- goto invalid;
- }
- }
next_table = (const BYTE_LOOKUP *)next_info;
goto follow_byte;
/* maybe rewrite the following cases to use fallthrough???? */
@@ -411,7 +405,16 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p);
break;
case INVALID:
- goto invalid;
+ {
+ int unitlen = my_transcoder->from_unit_length;
+ if (in_stop - char_start <= unitlen)
+ in_p = in_stop;
+ else if (in_p - char_start <= unitlen)
+ in_p = char_start + unitlen;
+ else
+ in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen;
+ goto invalid;
+ }
case UNDEF:
goto undef;
}
diff --git a/transcode_data.h b/transcode_data.h
index 92f8ade436..ba2e6e99b3 100644
--- a/transcode_data.h
+++ b/transcode_data.h
@@ -72,8 +72,8 @@ typedef struct rb_transcoder {
const char *from_encoding;
const char *to_encoding;
const BYTE_LOOKUP *conv_tree_start;
+ int from_unit_length;
int max_output;
- int from_utf8;
VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */