summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog13
-rw-r--r--enc/trans/iso2022.trans3
-rw-r--r--test/ruby/test_transcode.rb3
-rw-r--r--tool/transcode-tblgen.rb2
-rw-r--r--transcode.c85
-rw-r--r--transcode_data.h1
6 files changed, 80 insertions, 27 deletions
diff --git a/ChangeLog b/ChangeLog
index 66232f81f2..6d2ca3a954 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,16 @@
+Tue Aug 12 07:41:13 2008 Tanaka Akira <akr@fsij.org>
+
+ * transcode_data.h (rb_transcoder): add resetstate_func field for
+ resetting a state of stateful encoding.
+
+ * enc/trans/iso2022.trans (rb_EUC_JP_to_ISO_2022_JP): specify
+ finish_eucjp_to_iso2022jp for resetstate_func.
+
+ * tool/transcode-tblgen.rb: specify NULL for resetstate_func.
+
+ * transcode.c (output_replacement_character): call resetstate_func
+ before appending the replacement character.
+
Tue Aug 12 07:19:24 2008 Tanaka Akira <akr@fsij.org>
* transcode.c (get_replacement_character): extracted from
diff --git a/enc/trans/iso2022.trans b/enc/trans/iso2022.trans
index 3209fad163..1d015eea54 100644
--- a/enc/trans/iso2022.trans
+++ b/enc/trans/iso2022.trans
@@ -136,7 +136,8 @@ rb_EUC_JP_to_ISO_2022_JP = {
1, /* input_unit_length */
3, /* max_input */
5, /* max_output */
- NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
+ NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp,
+ finish_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
};
void
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb
index 8664f443a0..d3bcb77c35 100644
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@@ -303,6 +303,9 @@ class TestTranscode < Test::Unit::TestCase
"\xdc\x00".encode("EUC-JP", "UTF-16BE", :invalid=>:replace), "[ruby-dev:35776]")
assert_equal("ab?cd?ef",
"\0a\0b\xdc\x00\0c\0d\xdf\x00\0e\0f".encode("EUC-JP", "UTF-16BE", :invalid=>:replace))
+
+ assert_equal("\e$B!!\e(B?".force_encoding("ISO-2022-JP"),
+ "\xA1\xA1\xFF".encode("ISO-2022-JP", "EUC-JP", invalid: :replace))
end
def test_undef_replace
diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb
index 3459bd8bf1..493e3b11a6 100644
--- a/tool/transcode-tblgen.rb
+++ b/tool/transcode-tblgen.rb
@@ -446,7 +446,7 @@ static const rb_transcoder
#{input_unit_length}, /* input_unit_length */
#{max_input}, /* max_input */
#{max_output}, /* max_output */
- NULL, NULL, NULL, NULL, NULL
+ NULL, NULL, NULL, NULL, NULL, NULL
};
End
tree_code + "\n" + transcoder_code
diff --git a/transcode.c b/transcode.c
index 9219a98cd6..1802552a41 100644
--- a/transcode.c
+++ b/transcode.c
@@ -292,19 +292,6 @@ get_replacement_character(rb_encoding *enc, int *len_ret)
}
}
-static void
-output_replacement_character(unsigned char **out_pp, rb_encoding *enc)
-{
- const char *replacement;
- int len;
- replacement = get_replacement_character(enc, &len);
-
- memcpy(*out_pp, replacement, len);
-
- *out_pp += len;
- return;
-}
-
/*
* Transcoding engine logic
*/
@@ -818,6 +805,62 @@ more_output_buffer(
*out_stop_ptr = *out_start_ptr + new_len;
}
+static void
+output_replacement_character(
+ VALUE destination,
+ unsigned char *(*resize_destination)(VALUE, int, int),
+ rb_trans_t *ts,
+ unsigned char **out_start_ptr,
+ unsigned char **out_pos,
+ unsigned char **out_stop_ptr)
+
+{
+ rb_transcoding *tc;
+ const rb_transcoder *tr;
+ int max_output;
+ rb_encoding *enc;
+ const char *replacement;
+ int len;
+
+ tc = ts->elems[ts->num_trans-1].tc; /* tc of the last entry in ts->elems */
+ tr = tc->transcoder;
+ max_output = tr->max_output;
+ enc = rb_enc_find(tr->to_encoding);
+
+ /*
+ * Append the replacement character of the destination encoding at
+ * *out_pos, calling resetstate_func first when the transcoder has one.
+ *
+ * Assumptions for stateful encodings:
+ * - The replacement character can be output in the reset state and
+ *   doesn't change the state.
+ * - An extra state-changing sequence is acceptable if the replacement
+ *   character contains a state-changing sequence.
+ *
+ * Currently the replacement character for stateful encodings such as
+ * ISO-2022-JP is "?" (no state-changing sequence), so none is emitted.
+ *
+ * These assumptions may be removed in the future; that requires scanning
+ * the replacement character for state-changing sequences.
+ */
+
+ if (tr->resetstate_func) {
+ if (*out_stop_ptr - *out_pos < max_output)
+ more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+ *out_pos += tr->resetstate_func(tc, *out_pos); /* advance by resetstate_func's return value */
+ }
+
+ if (*out_stop_ptr - *out_pos < max_output) /* re-check room: out_pos may have advanced above */
+ more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr);
+
+ replacement = get_replacement_character(enc, &len);
+
+ memcpy(*out_pos, replacement, len);
+
+ *out_pos += len;
+ return;
+}
+
#if 1
static void
transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
@@ -848,9 +891,7 @@ resume:
goto resume;
}
else if (opt&INVALID_REPLACE) {
- if (out_stop - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
- output_replacement_character(out_pos, rb_enc_find(to_encoding));
+ output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
goto resume;
}
rb_trans_close(ts);
@@ -864,9 +905,7 @@ resume:
goto resume;
}
else if (opt&UNDEF_REPLACE) {
- if (out_stop - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
- output_replacement_character(out_pos, rb_enc_find(to_encoding));
+ output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
goto resume;
}
rb_trans_close(ts);
@@ -931,9 +970,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&INVALID_REPLACE) {
- if (out_stop - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
- output_replacement_character(out_pos, rb_enc_find(to_encoding));
+ output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
break;
}
rb_trans_close(ts);
@@ -948,9 +985,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&UNDEF_REPLACE) {
- if (out_stop - *out_pos < max_output)
- more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
- output_replacement_character(out_pos, rb_enc_find(to_encoding));
+ output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
break;
}
rb_trans_close(ts);
diff --git a/transcode_data.h b/transcode_data.h
index 1dc7ac8e92..24c88fc89b 100644
--- a/transcode_data.h
+++ b/transcode_data.h
@@ -95,6 +95,7 @@ struct rb_transcoder {
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */
int (*func_so)(rb_transcoding*, const unsigned char*, size_t, unsigned char*); /* start -> output */
+ int (*resetstate_func)(rb_transcoding*, unsigned char*); /* -> output */
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
};