summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-08-09 06:02:01 +0000
committer: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-08-09 06:02:01 +0000
commit: 139234e1a091ac3167d3bebdfcd29b3952665334 (patch)
tree: c1f7be47d34829813983af23e46340bf806d4ff4
parent: 750cb61e65f35feb70ad855ab3353485cfd75b0c (diff)
* transcode_data.h (rb_transcoding): add fields for restartable
  transcoding.
  (rb_transcoder): add max_input field.
  from_unit_length field is renamed to input_unit_length.

* tool/transcode-tblgen.rb: generate max_input field.

* enc/trans/iso2022.erb.c: follow rb_transcoder change.

* enc/trans/utf_16_32.erb.c: ditto.

* transcode.c (PARTIAL_INPUT): new constant.
  (transcode_char_start): new function.
  (transcode_result_t): new type.
  (transcode_restartable): new function.
  (more_output_buffer): new function.
  (transcode_loop): use transcode_restartable.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18452 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--  ChangeLog                   20
-rw-r--r--  enc/trans/iso2022.erb.c     10
-rw-r--r--  enc/trans/utf_16_32.erb.c   40
-rw-r--r--  tool/transcode-tblgen.rb    35
-rw-r--r--  transcode.c                399
-rw-r--r--  transcode_data.h            15
6 files changed, 438 insertions, 81 deletions
diff --git a/ChangeLog b/ChangeLog
index 40f67df545..902bd6831b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,23 @@
+Sat Aug 9 14:39:34 2008 Tanaka Akira <akr@fsij.org>
+
+ * transcode_data.h (rb_transcoding): add fields for restartable
+ transcoding.
+ (rb_transcoder): add max_input field.
+ from_unit_length field is renamed to input_unit_length.
+
+ * tool/transcode-tblgen.rb: generate max_input field.
+
+ * enc/trans/iso2022.erb.c: follow rb_transcoder change.
+
+ * enc/trans/utf_16_32.erb.c: ditto.
+
+ * transcode.c (PARTIAL_INPUT): new constant.
+ (transcode_char_start): new function.
+ (transcode_result_t): new type.
+ (transcode_restartable): new function.
+ (more_output_buffer): new function.
+ (transcode_loop): use transcode_restartable.
+
Sat Aug 9 13:35:08 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* stable/ext/socket/socket.c (NI_MAXHOST, NI_MAXSERV): fixed invalid
diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c
index 72553f4054..3209fad163 100644
--- a/enc/trans/iso2022.erb.c
+++ b/enc/trans/iso2022.erb.c
@@ -57,7 +57,10 @@ fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, u
static const rb_transcoder
rb_ISO_2022_JP_to_EUC_JP = {
- "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3,
+ "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp,
+ 1, /* input_unit_length */
+ 3, /* max_input */
+ 3, /* max_output */
NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
};
@@ -129,7 +132,10 @@ finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
static const rb_transcoder
rb_EUC_JP_to_ISO_2022_JP = {
- "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5,
+ "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp,
+ 1, /* input_unit_length */
+ 3, /* max_input */
+ 5, /* max_output */
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
};
diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c
index 86f1ed4727..aea2ab50a8 100644
--- a/enc/trans/utf_16_32.erb.c
+++ b/enc/trans/utf_16_32.erb.c
@@ -231,7 +231,10 @@ fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned
static const rb_transcoder
rb_from_UTF_16BE = {
- "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4,
+ "UTF-16BE", "UTF-8", &from_UTF_16BE,
+ 2, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_from_utf_16be
};
@@ -252,7 +255,10 @@ rb_from_UTF_16BE = {
static const rb_transcoder
rb_to_UTF_16BE = {
- "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4,
+ "UTF-8", "UTF-16BE", &to_UTF_16BE,
+ 1, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_to_utf_16be
};
@@ -265,13 +271,19 @@ rb_to_UTF_16BE = {
static const rb_transcoder
rb_from_UTF_16LE = {
- "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4,
+ "UTF-16LE", "UTF-8", &from_UTF_16LE,
+ 2, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_from_utf_16le
};
static const rb_transcoder
rb_to_UTF_16LE = {
- "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4,
+ "UTF-8", "UTF-16LE", &to_UTF_16BE,
+ 1, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_to_utf_16le
};
@@ -284,13 +296,19 @@ rb_to_UTF_16LE = {
static const rb_transcoder
rb_from_UTF_32BE = {
- "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4,
+ "UTF-32BE", "UTF-8", &from_UTF_32BE,
+ 4, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_from_utf_32be
};
static const rb_transcoder
rb_to_UTF_32BE = {
- "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4,
+ "UTF-8", "UTF-32BE", &to_UTF_16BE,
+ 1, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_to_utf_32be
};
@@ -303,13 +321,19 @@ rb_to_UTF_32BE = {
static const rb_transcoder
rb_from_UTF_32LE = {
- "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4,
+ "UTF-32LE", "UTF-8", &from_UTF_32LE,
+ 4, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_from_utf_32le
};
static const rb_transcoder
rb_to_UTF_32LE = {
- "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4,
+ "UTF-8", "UTF-32LE", &to_UTF_16BE,
+ 1, /* input_unit_length */
+ 4, /* max_input */
+ 4, /* max_output */
NULL, NULL, NULL, &fun_so_to_utf_32le
};
diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb
index 3a20b3f0b1..119fa0d1cb 100644
--- a/tool/transcode-tblgen.rb
+++ b/tool/transcode-tblgen.rb
@@ -101,6 +101,22 @@ class StrSet
"\#<#{self.class}: #{self.to_s}>"
end
+ def min_length
+ if @pat.empty?
+ nil
+ else
+ @pat.map {|seq| seq.length }.min
+ end
+ end
+
+ def max_length
+ if @pat.empty?
+ nil
+ else
+ @pat.map {|seq| seq.length }.max
+ end
+ end
+
def emptyable?
@pat.any? {|seq|
seq.empty?
@@ -170,6 +186,10 @@ class ActionMap
">"
end
+ def max_input_length
+ @map.keys.map {|k| k.max_length }.max
+ end
+
def empty_action
@map.each {|ss, action|
return action if ss.emptyable?
@@ -386,6 +406,8 @@ def transcode_compile_tree(name, from, map)
}
am = ActionMap.parse(h)
+ max_input = am.max_input_length
+
if ValidEncoding[from]
valid_encoding = StrSet.parse(ValidEncoding[from])
else
@@ -394,7 +416,7 @@ def transcode_compile_tree(name, from, map)
code = ''
defined_name = am.generate_node(code, name, valid_encoding)
- return defined_name, code
+ return defined_name, code, max_input
end
TRANSCODERS = []
@@ -411,16 +433,19 @@ def transcode_tblgen(from, to, map)
tree_name = "from_#{id_from}_to_#{id_to}"
end
map = encode_utf8(map)
- real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)
+ real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map)
transcoder_name = "rb_#{tree_name}"
TRANSCODERS << transcoder_name
- from_unit_length = UnitLength[from]
+ input_unit_length = UnitLength[from]
max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
- #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output},
- NULL, NULL,
+ #{c_esc from}, #{c_esc to}, &#{real_tree_name},
+ #{input_unit_length}, /* input_unit_length */
+ #{max_input}, /* max_input */
+ #{max_output}, /* max_output */
+ NULL, NULL, NULL, NULL, NULL
};
End
tree_code + "\n" + transcoder_code
diff --git a/transcode.c b/transcode.c
index 75a802572c..4c979e8c41 100644
--- a/transcode.c
+++ b/transcode.c
@@ -20,6 +20,7 @@ static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace;
#define INVALID_REPLACE 0x2
#define UNDEF_IGNORE 0x10
#define UNDEF_REPLACE 0x20
+#define PARTIAL_INPUT 0x100
/*
* Dispatch data and logic
@@ -324,34 +325,117 @@ output_replacement_character(unsigned char **out_pp, rb_encoding *enc)
/*
* Transcoding engine logic
*/
-static void
-transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
- const unsigned char *in_stop, unsigned char *out_stop,
- const rb_transcoder *my_transcoder,
- rb_transcoding *my_transcoding,
- const int opt)
+
+static const unsigned char *
+transcode_char_start(rb_transcoding *my_transcoding,
+ const unsigned char **in_pos,
+ const unsigned char *in_p,
+ int readlen)
+{
+ const unsigned char *ptr;
+ if (in_p - *in_pos < readlen) {
+ int restlen = readlen - my_transcoding->readlen;
+ MEMCPY(TRANSCODING_READBUF(my_transcoding) + my_transcoding->readlen,
+ in_p - restlen, unsigned char, restlen);
+ my_transcoding->readlen = readlen;
+ ptr = TRANSCODING_READBUF(my_transcoding);
+ }
+ else {
+ ptr = in_p - readlen;
+ }
+ return ptr;
+}
+
+typedef enum {
+ transcode_invalid_input,
+ transcode_undefined_conversion,
+ transcode_obuf_full,
+ transcode_ibuf_empty,
+ transcode_finished,
+} transcode_result_t;
+
+static transcode_result_t
+transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
+ const unsigned char *in_stop, unsigned char *out_stop,
+ const rb_transcoder *my_transcoder,
+ rb_transcoding *my_transcoding,
+ const int opt)
+
{
- const unsigned char *in_p = *in_pos;
- unsigned char *out_p = *out_pos;
- const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start;
+ int unitlen = my_transcoder->input_unit_length;
+
+ const unsigned char *in_p;
+ unsigned char *out_p;
+ int readlen;
const BYTE_LOOKUP *next_table;
- const unsigned char *char_start;
- VALUE next_info;
- unsigned char next_byte;
- unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
- rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
-
- while (in_p < in_stop) {
- char_start = in_p;
- next_table = conv_tree_start;
- if (out_p >= out_s) {
- int len = (out_p - *out_pos);
- int new_len = (len + my_transcoder->max_output) * 2;
- *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
- out_p = *out_pos + len;
- out_s = *out_pos + new_len - my_transcoder->max_output;
- }
+
+ unsigned char empty_buf;
+ unsigned char *empty_ptr = &empty_buf;
+
+ if (!in_pos) {
+ in_pos = (const unsigned char **)&empty_ptr;
+ in_stop = empty_ptr;
+ }
+
+ if (!out_pos) {
+ out_pos = &empty_ptr;
+ out_stop = empty_ptr;
+ }
+
+ in_p = *in_pos;
+ out_p = *out_pos;
+ readlen = my_transcoding->readlen;
+ next_table = my_transcoding->next_table;
+
+#define SUSPEND(ret, num) \
+ do { \
+ my_transcoding->resume_position = (num); \
+ if (my_transcoding->readlen < readlen) \
+ MEMCPY(TRANSCODING_READBUF(my_transcoding)+my_transcoding->readlen, \
+ in_p - (readlen-my_transcoding->readlen), \
+ unsigned char, \
+ readlen-my_transcoding->readlen); \
+ *in_pos = in_p; \
+ *out_pos = out_p; \
+ my_transcoding->readlen = readlen; \
+ my_transcoding->next_table = next_table; \
+ return ret; \
+ resume_label ## num:; \
+ } while (0)
+
+ switch (my_transcoding->resume_position) {
+ case 0: break;
+ case 1: goto resume_label1;
+ case 2: goto resume_label2;
+ case 3: goto resume_label3;
+ case 4: goto resume_label4;
+ case 5: goto resume_label5;
+ case 6: goto resume_label6;
+ case 7: goto resume_label7;
+ case 8: goto resume_label8;
+ case 9: goto resume_label9;
+ case 10: goto resume_label10;
+ case 11: goto resume_label11;
+ case 12: goto resume_label12;
+ case 13: goto resume_label13;
+ case 14: goto resume_label14;
+ }
+
+ while (1) {
+ unsigned char next_byte;
+ VALUE next_info;
+
+ if (in_stop <= in_p) {
+ if (!(opt & PARTIAL_INPUT))
+ break;
+ SUSPEND(transcode_ibuf_empty, 7);
+ continue;
+ }
+
+ my_transcoding->readlen = readlen = 0;
+ next_table = my_transcoder->conv_tree_start;
next_byte = (unsigned char)*in_p++;
+ readlen++;
follow_byte:
if (next_byte < next_table->base[0] || next_table->base[1] < next_byte)
next_info = INVALID;
@@ -361,32 +445,42 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
}
follow_info:
switch (next_info & 0x1F) {
- case NOMAP:
+ case NOMAP: /* xxx: copy last byte only? */
+ while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 3); }
*out_p++ = next_byte;
continue;
case 0x00: case 0x04: case 0x08: case 0x0C:
case 0x10: case 0x14: case 0x18: case 0x1C:
- if (in_p >= in_stop) {
- /* todo: deal with the case of backtracking */
- /* todo: deal with incomplete input (streaming) */
- goto invalid;
+ while (in_p >= in_stop) {
+ if (!(opt & PARTIAL_INPUT))
+ goto invalid;
+ SUSPEND(transcode_ibuf_empty, 5);
}
next_byte = (unsigned char)*in_p++;
+ readlen++;
next_table = (const BYTE_LOOKUP *)next_info;
goto follow_byte;
/* maybe rewrite the following cases to use fallthrough???? */
case ZERObt: /* drop input */
continue;
case ONEbt:
+ while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 9); }
*out_p++ = getBT1(next_info);
continue;
case TWObt:
+ while (out_stop - out_p < 2) { SUSPEND(transcode_obuf_full, 10); }
+ *out_p++ = getBT1(next_info);
+ *out_p++ = getBT2(next_info);
+ continue;
+ case THREEbt:
+ while (out_stop - out_p < 3) { SUSPEND(transcode_obuf_full, 11); }
*out_p++ = getBT1(next_info);
*out_p++ = getBT2(next_info);
+ *out_p++ = getBT3(next_info);
continue;
case FOURbt:
+ while (out_stop - out_p < 4) { SUSPEND(transcode_obuf_full, 12); }
*out_p++ = getBT0(next_info);
- case THREEbt: /* fall through */
*out_p++ = getBT1(next_info);
*out_p++ = getBT2(next_info);
*out_p++ = getBT3(next_info);
@@ -395,70 +489,245 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
next_info = (VALUE)(*my_transcoder->func_ii)(my_transcoding, next_info);
goto follow_info;
case FUNsi:
- next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, char_start, (size_t)(in_p-char_start));
- goto follow_info;
- break;
+ {
+ const unsigned char *char_start;
+ char_start = transcode_char_start(my_transcoding, in_pos, in_p, readlen);
+ next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, char_start, (size_t)readlen);
+ break;
+ }
case FUNio:
+ while (out_stop - out_p < my_transcoder->max_output) { SUSPEND(transcode_obuf_full, 13); }
out_p += (VALUE)(*my_transcoder->func_io)(my_transcoding, next_info, out_p);
break;
case FUNso:
- out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p);
- break;
+ {
+ const unsigned char *char_start;
+ while (out_stop - out_p < my_transcoder->max_output) { SUSPEND(transcode_obuf_full, 14); }
+ char_start = transcode_char_start(my_transcoding, in_pos, in_p, readlen);
+ out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)readlen, out_p);
+ break;
+ }
case INVALID:
{
- int unitlen = my_transcoder->from_unit_length;
- if (in_stop - char_start <= unitlen)
- in_p = in_stop;
- else if (in_p - char_start <= unitlen)
- in_p = char_start + unitlen;
- else
- in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen;
+ if (readlen <= unitlen) {
+ while ((opt & PARTIAL_INPUT) && readlen + (in_stop - in_p) < unitlen) {
+ readlen += in_stop - in_p;
+ in_p = in_stop;
+ SUSPEND(transcode_ibuf_empty, 8);
+ }
+ if (readlen + (in_stop - in_p) <= unitlen)
+ in_p = in_stop;
+ else
+ in_p += unitlen - readlen;
+ }
+ else {
+ /* xxx: possibly in_p is lesser than *in_pos
+ * caller may want to access readbuf. */
+ in_p += ((readlen - 1) / unitlen) * unitlen - readlen;
+ }
goto invalid;
}
case UNDEF:
goto undef;
}
continue;
+
invalid:
+ SUSPEND(transcode_invalid_input, 1);
+ continue;
+
+ undef:
+ SUSPEND(transcode_undefined_conversion, 2);
+ continue;
+ }
+
+ /* cleanup */
+ if (my_transcoder->finish_func) {
+ while (out_stop - out_p < my_transcoder->max_output) {
+ SUSPEND(transcode_obuf_full, 4);
+ }
+ out_p += my_transcoder->finish_func(my_transcoding, out_p);
+ }
+ while (1)
+ SUSPEND(transcode_finished, 6);
+#undef SUSPEND
+}
+
+static void
+more_output_buffer(
+ rb_transcoding *my_transcoding,
+ unsigned char **out_start_ptr,
+ unsigned char **out_pos,
+ unsigned char **out_stop_ptr)
+{
+ size_t len = (*out_pos - *out_start_ptr);
+ size_t new_len = (len + my_transcoding->transcoder->max_output) * 2;
+ *out_start_ptr = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
+ *out_pos = *out_start_ptr + len;
+ *out_stop_ptr = *out_start_ptr + new_len;
+}
+
+#if 1
+static void
+transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
+ const unsigned char *in_stop, unsigned char *out_stop,
+ const rb_transcoder *my_transcoder,
+ rb_transcoding *my_transcoding,
+ const int opt)
+{
+ transcode_result_t ret;
+ unsigned char *out_start = *out_pos;
+
+ my_transcoding->resume_position = 0;
+ my_transcoding->readlen = 0;
+
+ if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) {
+ my_transcoding->readbuf.ptr = xmalloc(my_transcoder->max_input);
+ }
+#define CLEANUP \
+ do { \
+ if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) \
+ xfree(my_transcoding->readbuf.ptr); \
+ } while(0)
+
+resume:
+ ret = transcode_restartable(in_pos, out_pos, in_stop, out_stop, my_transcoder, my_transcoding, opt);
+ if (ret == transcode_invalid_input) {
/* deal with invalid byte sequence */
/* todo: add more alternative behaviors */
if (opt&INVALID_IGNORE) {
- continue;
+ goto resume;
}
else if (opt&INVALID_REPLACE) {
- output_replacement_character(&out_p, to_encoding);
- continue;
+ if (out_stop - *out_pos < my_transcoder->max_output)
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding));
+ goto resume;
}
+ CLEANUP;
rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
- continue;
- undef:
+ }
+ if (ret == transcode_undefined_conversion) {
/* valid character in from encoding
* but no related character(s) in to encoding */
/* todo: add more alternative behaviors */
if (opt&UNDEF_IGNORE) {
- continue;
+ goto resume;
}
else if (opt&UNDEF_REPLACE) {
- output_replacement_character(&out_p, to_encoding);
- continue;
+ if (out_stop - *out_pos < my_transcoder->max_output)
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding));
+ goto resume;
}
- rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
- continue;
+ CLEANUP;
+ rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
}
- /* cleanup */
- if (my_transcoder->finish_func) {
- if (out_p >= out_s) {
- int len = (out_p - *out_pos);
- int new_len = (len + my_transcoder->max_output) * 2;
- *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
- out_p = *out_pos + len;
- out_s = *out_pos + new_len - my_transcoder->max_output;
- }
- out_p += my_transcoder->finish_func(my_transcoding, out_p);
+ if (ret == transcode_obuf_full) {
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ goto resume;
}
- *in_pos = in_p;
- *out_pos = out_p;
+
+ CLEANUP;
+ return;
+#undef CLEANUP
+}
+#else
+/* sample transcode_loop implementation in byte-by-byte stream style */
+static void
+transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
+ const unsigned char *in_stop, unsigned char *out_stop,
+ const rb_transcoder *my_transcoder,
+ rb_transcoding *my_transcoding,
+ const int opt)
+{
+ transcode_result_t ret;
+ unsigned char *out_start = *out_pos;
+ const unsigned char *ptr;
+
+ my_transcoding->resume_position = 0;
+ my_transcoding->readlen = 0;
+
+ if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) {
+ my_transcoding->readbuf.ptr = xmalloc(my_transcoder->max_input);
+ }
+#define CLEANUP \
+ do { \
+ if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) \
+ xfree(my_transcoding->readbuf.ptr); \
+ } while(0)
+
+ ret = transcode_ibuf_empty;
+ ptr = *in_pos;
+ while (ret != transcode_finished) {
+ unsigned char input_byte;
+ const unsigned char *p = &input_byte;
+
+ if (ret == transcode_ibuf_empty) {
+ if (ptr < in_stop) {
+ input_byte = *ptr;
+ ret = transcode_restartable(&p, out_pos, p+1, out_stop, my_transcoder, my_transcoding, opt|PARTIAL_INPUT);
+ }
+ else {
+ ret = transcode_restartable(NULL, out_pos, NULL, out_stop, my_transcoder, my_transcoding, opt);
+ }
+ }
+ else {
+ ret = transcode_restartable(NULL, out_pos, NULL, out_stop, my_transcoder, my_transcoding, opt|PARTIAL_INPUT);
+ }
+ if (&input_byte != p)
+ ptr += p - &input_byte;
+ switch (ret) {
+ case transcode_invalid_input:
+ /* deal with invalid byte sequence */
+ /* todo: add more alternative behaviors */
+ if (opt&INVALID_IGNORE) {
+ break;
+ }
+ else if (opt&INVALID_REPLACE) {
+ if (out_stop - *out_pos < my_transcoder->max_output)
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding));
+ break;
+ }
+ CLEANUP;
+ rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
+ break;
+
+ case transcode_undefined_conversion:
+ /* valid character in from encoding
+ * but no related character(s) in to encoding */
+ /* todo: add more alternative behaviors */
+ if (opt&UNDEF_IGNORE) {
+ break;
+ }
+ else if (opt&UNDEF_REPLACE) {
+ if (out_stop - *out_pos < my_transcoder->max_output)
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding));
+ break;
+ }
+ CLEANUP;
+ rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
+ break;
+
+ case transcode_obuf_full:
+ more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop);
+ break;
+
+ case transcode_ibuf_empty:
+ break;
+
+ case transcode_finished:
+ break;
+ }
+ }
+ CLEANUP;
+ *in_pos = in_stop;
+ return;
+#undef CLEANUP
}
+#endif
/*
diff --git a/transcode_data.h b/transcode_data.h
index ba2e6e99b3..3801c38ec8 100644
--- a/transcode_data.h
+++ b/transcode_data.h
@@ -64,15 +64,28 @@ typedef struct rb_transcoding {
or NULL if something else is being converted */
unsigned char *(*flush_func)(struct rb_transcoding*, int, int);
+ int resume_position;
+ const BYTE_LOOKUP *next_table;
+ int readlen;
+ union {
+ unsigned char ary[8]; /* max_input <= sizeof(ary) */
+ unsigned char *ptr; /* length is max_input */
+ } readbuf;
+
unsigned char stateful[256]; /* opaque data for stateful encoding */
} rb_transcoding;
+#define TRANSCODING_READBUF(tc) \
+ ((tc)->transcoder->max_input <= sizeof((tc)->readbuf.ary) ? \
+ (tc)->readbuf.ary : \
+ (tc)->readbuf.ptr)
/* static structure, one per supported encoding pair */
typedef struct rb_transcoder {
const char *from_encoding;
const char *to_encoding;
const BYTE_LOOKUP *conv_tree_start;
- int from_unit_length;
+ int input_unit_length;
+ int max_input;
int max_output;
VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */