summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog16
-rw-r--r--enc/trans/iso2022.trans17
-rw-r--r--include/ruby/encoding.h10
-rw-r--r--tool/transcode-tblgen.rb3
-rw-r--r--transcode.c114
-rw-r--r--transcode_data.h3
6 files changed, 114 insertions, 49 deletions
diff --git a/ChangeLog b/ChangeLog
index 457bdedd05..94bd6e9ca2 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+Fri Aug 15 00:52:40 2008 Tanaka Akira <akr@fsij.org>
+
+ * include/ruby/encoding.h (rb_econv_output): declared.
+
+ * transcode_data.h (rb_transcoder): add resetsize_func field.
+
+ * enc/trans/iso2022.trans (iso2022jp_reset_sequence_size): defined.
+ (rb_EUC_JP_to_ISO_2022_JP): provede resetsize_func.
+
+ * tool/transcode-tblgen.rb: set NULL for resetsize_func.
+
+ * transcode.c (rb_econv_output): new function for inserting output.
+ (output_replacement_character): use rb_econv_output.
+ (transcode_loop): check return value of
+ output_replacement_character.
+
Thu Aug 14 23:47:21 2008 Tanaka Akira <akr@fsij.org>
* include/ruby/encoding.h (ECONV_UNIVERSAL_NEWLINE_DECODER): defined.
diff --git a/enc/trans/iso2022.trans b/enc/trans/iso2022.trans
index 1d015eea54..0414493635 100644
--- a/enc/trans/iso2022.trans
+++ b/enc/trans/iso2022.trans
@@ -83,7 +83,7 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
if (t->stateful[0] == 0) {
t->stateful[0] = 1; /* initialized flag */
- t->stateful[1] = 1; /* ASCII mode */
+ t->stateful[1] = 1; /* G0 = ASCII */
}
if (l != t->stateful[1]) {
@@ -91,13 +91,13 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
*o++ = 0x1b;
*o++ = '(';
*o++ = 'B';
- t->stateful[1] = 1;
+ t->stateful[1] = 1; /* G0 = ASCII */
}
else {
*o++ = 0x1b;
*o++ = '$';
*o++ = 'B';
- t->stateful[1] = 2;
+ t->stateful[1] = 2; /* G0 = JIS X 0208 1983 */
}
}
@@ -113,6 +113,14 @@ fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, u
}
static int
+iso2022jp_reset_sequence_size(rb_transcoding *t)
+{
+ if (t->stateful[1] == 2)
+ return 3;
+ return 0;
+}
+
+static int
finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
{
unsigned char *output0 = o;
@@ -137,7 +145,8 @@ rb_EUC_JP_to_ISO_2022_JP = {
3, /* max_input */
5, /* max_output */
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
- finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
+ finish_eucjp_to_iso2022jp,
+ iso2022jp_reset_sequence_size, finish_eucjp_to_iso2022jp
};
void
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 7e188a0c1f..d7ad0d0237 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -230,11 +230,15 @@ typedef struct {
rb_encoding *destination_encoding;
} rb_econv_t;
-rb_econv_t *rb_econv_open(const char *from, const char *to, int flags);
+rb_econv_t *rb_econv_open(const char *source_encoding, const char *destination_encoding, int flags);
rb_econv_result_t rb_econv_convert(rb_econv_t *ec,
- const unsigned char **input_ptr, const unsigned char *input_stop,
- unsigned char **output_ptr, unsigned char *output_stop,
+ const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end,
+ unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
int flags);
+int rb_econv_output(rb_econv_t *ec,
+ const unsigned char *str, size_t len,
+ unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
+ size_t *required_size);
void rb_econv_close(rb_econv_t *ec);
/* flags for rb_econv_open */
diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb
index b0d35f6230..87bb29c885 100644
--- a/tool/transcode-tblgen.rb
+++ b/tool/transcode-tblgen.rb
@@ -449,7 +449,8 @@ static const rb_transcoder
#{input_unit_length}, /* input_unit_length */
#{max_input}, /* max_input */
#{max_output}, /* max_output */
- NULL, NULL, NULL, NULL, NULL, NULL
+ NULL, NULL, NULL, NULL,
+ NULL, NULL, NULL
};
End
tree_code + "\n" + transcoder_code
diff --git a/transcode.c b/transcode.c
index 2a68b4ebd9..b8c8d1a1ae 100644
--- a/transcode.c
+++ b/transcode.c
@@ -937,6 +937,58 @@ rb_econv_convert(rb_econv_t *ec,
return res;
}
+int
+rb_econv_output(rb_econv_t *ec,
+ const unsigned char *str, size_t len, /* string in destination encoding */
+ unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end,
+ size_t *required_size)
+{
+ size_t reset_len, total_len;
+ rb_transcoding *tc = ec->last_tc;
+ const rb_transcoder *tr = tc->transcoder;
+
+ /*
+ * Assumption for stateful encoding:
+ *
+ * - str can be output on resetted state and doesn't change the state.
+ * - it is acceptable that extra state changing sequence if str contains
+ * a state changing sequence.
+ *
+ * Currently the replacement character for stateful encoding such as
+ * ISO-2022-JP is "?" and it has no state changing sequence.
+ * So the extra state changing sequence don't occur when
+ * rb_econv_output is used for replacement characters.
+ *
+ * Thease assumption may be removed in future.
+ * It needs to scan str to check state changing sequences in it.
+ */
+
+ reset_len = 0;
+ if (tr->resetsize_func) {
+ reset_len = tr->resetsize_func(tc);
+ }
+
+ total_len = reset_len + len;
+ if (total_len < len)
+ return -1;
+
+ if (required_size) {
+ *required_size = total_len;
+ }
+
+ if (destination_buffer_end - *destination_buffer_ptr < total_len)
+ return -1;
+
+ if (reset_len) {
+ *destination_buffer_ptr += tr->resetstate_func(tc, *destination_buffer_ptr);
+ }
+
+ memcpy(*destination_buffer_ptr, str, len);
+ *destination_buffer_ptr += len;
+
+ return 0;
+}
+
void
rb_econv_close(rb_econv_t *ec)
{
@@ -968,58 +1020,40 @@ more_output_buffer(
*out_stop_ptr = *out_start_ptr + new_len;
}
-static void
+static int
output_replacement_character(
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
- rb_transcoding *tc,
+ rb_econv_t *ec,
unsigned char **out_start_ptr,
unsigned char **out_pos,
unsigned char **out_stop_ptr)
{
+ rb_transcoding *tc = ec->last_tc;
const rb_transcoder *tr;
- int max_output;
rb_encoding *enc;
- const char *replacement;
+ const unsigned char *replacement;
int len;
+ size_t required_size;
tr = tc->transcoder;
- max_output = tr->max_output;
enc = rb_enc_find(tr->to_encoding);
- /*
- * Assumption for stateful encoding:
- *
- * - The replacement character can be output on resetted state and doesn't
- * change the state.
- * - it is acceptable that extra state changing sequence if the replacement
- * character contains a state changing sequence.
- *
- * Currently the replacement character for stateful encoding such as
- * ISO-2022-JP is "?" and it has no state changing sequence.
- * So the extra state changing sequence don't occur.
- *
- * Thease assumption may be removed in future.
- * It needs to scan the replacement character to check
- * state changing sequences in the replacement character.
- */
+ replacement = (const unsigned char *)get_replacement_character(enc, &len);
- if (tr->resetstate_func) {
- if (*out_stop_ptr - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
- *out_pos += tr->resetstate_func(tc, *out_pos);
- }
+ if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0)
+ return 0;
- if (*out_stop_ptr - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
+ if (required_size < len)
+ return -1; /* overflow */
- replacement = get_replacement_character(enc, &len);
+ more_output_buffer(destination, resize_destination, required_size, out_start_ptr, out_pos, out_stop_ptr);
- memcpy(*out_pos, replacement, len);
+ if (rb_econv_output(ec, replacement, len, out_pos, *out_stop_ptr, &required_size) == 0)
+ return 0;
- *out_pos += len;
- return;
+ return -1;
}
#if 1
@@ -1054,8 +1088,8 @@ resume:
goto resume;
}
else if (opt&INVALID_REPLACE) {
- output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
- goto resume;
+ if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
+ goto resume;
}
rb_econv_close(ec);
rb_raise(rb_eInvalidByteSequence, "invalid byte sequence");
@@ -1068,8 +1102,8 @@ resume:
goto resume;
}
else if (opt&UNDEF_REPLACE) {
- output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
- goto resume;
+ if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
+ goto resume;
}
rb_econv_close(ec);
rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)");
@@ -1135,8 +1169,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&INVALID_REPLACE) {
- output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
- break;
+ if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
+ break;
}
rb_econv_close(ec);
rb_raise(rb_eInvalidByteSequence, "invalid byte sequence");
@@ -1150,8 +1184,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&UNDEF_REPLACE) {
- output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
- break;
+ if (output_replacement_character(destination, resize_destination, ec, &out_start, out_pos, &out_stop) == 0)
+ break;
}
rb_econv_close(ec);
rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)");
diff --git a/transcode_data.h b/transcode_data.h
index b53a1813df..69f3048124 100644
--- a/transcode_data.h
+++ b/transcode_data.h
@@ -107,8 +107,9 @@ struct rb_transcoder {
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
- int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
+ int (*resetsize_func)(rb_transcoding*); /* -> len */
+ int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
};
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);