summaryrefslogtreecommitdiff
path: root/transcode.c
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-07-14 06:27:26 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-07-14 06:27:26 +0000
commitef62472e44f921f5a03bf2bf8367e9291fe75a6f (patch)
tree5b4ae6e2b7fdf1f32e4922341f903b98ab7f3a1c /transcode.c
parent641d6ece1ef85af7f1bfffdaa6defc4e5a8c9228 (diff)
* transcode.c (get_replacement_character): temporary function,
  get characters for replacement mode.
* transcode.c (transcode_loop): add undef key and replace value.
* transcode.c (str_transcode): ditto.
* transcode.c (Init_transcode): define sym_undef and sym_replace.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18062 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'transcode.c')
-rw-r--r--transcode.c85
1 files changed, 80 insertions, 5 deletions
diff --git a/transcode.c b/transcode.c
index d3e59a815b..cad86e511e 100644
--- a/transcode.c
+++ b/transcode.c
@@ -15,8 +15,11 @@
#include "transcode_data.h"
#include <ctype.h>
-static VALUE sym_invalid, sym_ignore;
+static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace;
#define INVALID_IGNORE 0x1
+#define INVALID_REPLACE 0x2
+#define UNDEF_IGNORE 0x10
+#define UNDEF_REPLACE 0x20
/*
* Dispatch data and logic
@@ -119,6 +122,36 @@ transcode_dispatch(const char* from_encoding, const char* to_encoding)
return (rb_transcoder *)val;
}
+/*
+ * Temporary helper: return the bytes of the replacement character ('?')
+ * to emit when converting into encoding enc.
+ *
+ * ASCII-compatible encodings get the single byte "?".  UTF-16BE/LE and
+ * UTF-32BE/LE get '?' padded with zero bytes in the matching byte order.
+ * Any other encoding falls back to "?".
+ *
+ * The returned pointer refers to a string literal and must not be
+ * modified or freed.  The UTF-16/UTF-32 literals contain embedded NUL
+ * bytes, so their length cannot be recovered with strlen() or by
+ * scanning for a terminating NUL.
+ */
+static const char*
+get_replacement_character(rb_encoding *enc)
+{
+    /* Looked up once on first use and cached for later calls. */
+    static rb_encoding *utf16be_encoding, *utf16le_encoding;
+    static rb_encoding *utf32be_encoding, *utf32le_encoding;
+    if (!utf16be_encoding) {
+	utf16be_encoding = rb_enc_find("UTF-16BE");
+	utf16le_encoding = rb_enc_find("UTF-16LE");
+	utf32be_encoding = rb_enc_find("UTF-32BE");
+	utf32le_encoding = rb_enc_find("UTF-32LE");
+    }
+    if (rb_enc_asciicompat(enc)) {
+	return "?";
+    }
+    /* Compare with ==; an assignment here would overwrite the cached
+     * encoding and make the first branch match every non-NULL enc. */
+    else if (utf16be_encoding == enc) {
+	return "\x00?";
+    }
+    else if (utf16le_encoding == enc) {
+	return "?\x00";
+    }
+    else if (utf32be_encoding == enc) {
+	return "\x00\x00\x00?";
+    }
+    else if (utf32le_encoding == enc) {
+	return "?\x00\x00\x00";
+    }
+    else {
+	return "?";
+    }
+}
/*
* Transcoding engine logic
@@ -139,6 +172,7 @@ transcode_loop(unsigned char **in_pos, unsigned char **out_pos,
unsigned char next_byte;
int from_utf8 = my_transcoder->from_utf8;
unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
+ rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
while (in_p < in_stop) {
char_start = in_p;
next_table = conv_tree_start;
@@ -209,9 +243,7 @@ transcode_loop(unsigned char **in_pos, unsigned char **out_pos,
case INVALID:
goto invalid;
case UNDEF:
- /* todo: add code for alternate behaviors */
- rb_raise(rb_eRuntimeError /*@@@change exception*/, "conversion undefined for byte sequence (maybe invalid byte sequence)");
- continue;
+ goto undef;
}
continue;
invalid:
@@ -220,8 +252,31 @@ transcode_loop(unsigned char **in_pos, unsigned char **out_pos,
if (opt&INVALID_IGNORE) {
continue;
}
+ else if (opt&INVALID_REPLACE) {
+ const char *rep = get_replacement_character(to_encoding);
+ do {
+ *out_p++ = *rep++;
+ } while (*rep);
+ continue;
+ }
rb_raise(rb_eRuntimeError /*change exception*/, "invalid byte sequence");
continue;
+ undef:
+ /* valid character in from encoding
+ * but no related character(s) in to encoding */
+ /* todo: add more alternative behaviors */
+ if (opt&UNDEF_IGNORE) {
+ continue;
+ }
+ else if (opt&UNDEF_REPLACE) {
+ const char *rep = get_replacement_character(to_encoding);
+ do {
+ *out_p++ = *rep++;
+ } while (*rep);
+ continue;
+ }
+ rb_raise(rb_eRuntimeError /*@@@change exception*/, "conversion undefined for byte sequence (maybe invalid byte sequence)");
+ continue;
}
/* cleanup */
*in_pos = in_p;
@@ -265,11 +320,29 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
argc--;
v = rb_hash_aref(opt, sym_invalid);
if (NIL_P(v)) {
- rb_raise(rb_eArgError, "unknown value for invalid: setting");
}
else if (v==sym_ignore) {
options |= INVALID_IGNORE;
}
+ else if (v==sym_replace) {
+ options |= INVALID_REPLACE;
+ v = rb_hash_aref(opt, sym_replace);
+ }
+ else {
+ rb_raise(rb_eArgError, "unknown value for invalid: setting");
+ }
+ v = rb_hash_aref(opt, sym_undef);
+ if (NIL_P(v)) {
+ }
+ else if (v==sym_ignore) {
+ options |= UNDEF_IGNORE;
+ }
+ else if (v==sym_replace) {
+ options |= UNDEF_REPLACE;
+ }
+ else {
+ rb_raise(rb_eArgError, "unknown value for undef: setting");
+ }
}
if (argc < 1 || argc > 2) {
rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
@@ -451,7 +524,9 @@ Init_transcode(void)
transcoder_lib_table = st_init_strcasetable();
sym_invalid = ID2SYM(rb_intern("invalid"));
+ sym_undef = ID2SYM(rb_intern("undef"));
sym_ignore = ID2SYM(rb_intern("ignore"));
+ sym_replace = ID2SYM(rb_intern("replace"));
rb_define_method(rb_cString, "encode", str_encode, -1);
rb_define_method(rb_cString, "encode!", str_encode_bang, -1);