summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-08-24 06:25:24 +0000
committer: akr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-08-24 06:25:24 +0000
commit: c6d291b003a4d5f8b12d9ff79767525d359b5733 (patch)
tree: 7bff679edc648869ce7436b6db24062af78836b0
parent: 51ad3df2cec27075db72dff7291c5f8828d0c9da (diff)
* include/ruby/encoding.h (rb_str_transcode): add ecflags argument.
* transcode.c (econv_opts): extracted from str_transcode.
  (str_transcode_enc_args): extracted from str_transcode.
  (str_transcode0): extracted from str_transcode.
  (str_transcode): use econv_opts, str_transcode_enc_args, str_transcode0.
  (rb_str_transcode): call str_transcode0.
  (econv_primitive_insert_output): give the additional argument for rb_str_transcode.
* io.c (make_writeconv): use invalid/undef flags.
  (io_fwrite): ditto.
  (rb_scan_open_args): give the additional argument for rb_str_transcode.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18808 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- ChangeLog 18
-rw-r--r-- include/ruby/encoding.h 2
-rw-r--r-- io.c 33
-rw-r--r-- test/ruby/test_io_m17n.rb 63
-rw-r--r-- transcode.c 141
5 files changed, 198 insertions, 59 deletions
diff --git a/ChangeLog b/ChangeLog
index 2997cbca62..dd9bc0d1be 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,21 @@
+Sun Aug 24 15:21:28 2008 Tanaka Akira <akr@fsij.org>
+
+ * include/ruby/encoding.h (rb_str_transcode): add ecflags argument.
+
+ * transcode.c (econv_opts): extracted from str_transcode.
+ (str_transcode_enc_args): extracted from str_transcode.
+ (str_transcode0): extracted from str_transcode.
+ (str_transcode): use econv_opts, str_transcode_enc_args,
+ str_transcode0.
+ (rb_str_transcode): call str_transcode0.
+ (econv_primitive_insert_output): give the additional argument for
+ rb_str_transcode.
+
+ * io.c (make_writeconv): use invalid/undef flags.
+ (io_fwrite): ditto.
+ (rb_scan_open_args): give the additional argument for
+ rb_str_transcode.
+
Sun Aug 24 13:27:42 2008 Tanaka Akira <akr@fsij.org>
* transcode.c (str_transcode): check last hash only if 0 < argc.
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 9336e6d346..3701b2d12f 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -194,7 +194,7 @@ rb_enc_dummy_p(rb_encoding *enc)
return ENC_DUMMY_P(enc) != 0;
}
-VALUE rb_str_transcode(VALUE str, VALUE to);
+VALUE rb_str_transcode(VALUE str, VALUE to, int ecflags);
/* econv stuff */
diff --git a/io.c b/io.c
index 702e287066..69ed01caad 100644
--- a/io.c
+++ b/io.c
@@ -699,6 +699,12 @@ make_writeconv(rb_io_t *fptr)
fptr->writeconv_initialized = 1;
ecflags = 0;
+
+ if (fptr->mode & FMODE_INVALID_MASK)
+ ecflags |= (fptr->mode / (FMODE_INVALID_MASK/ECONV_INVALID_MASK)) & ECONV_INVALID_MASK;
+ if (fptr->mode & FMODE_UNDEF_MASK)
+ ecflags |= (fptr->mode / (FMODE_UNDEF_MASK/ECONV_UNDEF_MASK)) & ECONV_UNDEF_MASK;
+
#ifdef TEXTMODE_NEWLINE_ENCODER
if (NEED_NEWLINE_ENCODER(fptr))
ecflags |= TEXTMODE_NEWLINE_ENCODER;
@@ -740,18 +746,31 @@ io_fwrite(VALUE str, rb_io_t *fptr)
long len, n, r, l, offset = 0;
if (NEED_WRITECONV(fptr)) {
+ VALUE common_encoding = Qnil;
make_writeconv(fptr);
+
if (fptr->writeconv) {
- if (!NIL_P(fptr->writeconv_stateless)) {
- str = rb_str_transcode(str, fptr->writeconv_stateless);
- }
- str = rb_econv_str_convert(fptr->writeconv, str, ECONV_PARTIAL_INPUT);
+ if (!NIL_P(fptr->writeconv_stateless))
+ common_encoding = fptr->writeconv_stateless;
}
else {
if (fptr->enc2)
- str = rb_str_transcode(str, rb_enc_from_encoding(fptr->enc2));
+ common_encoding = rb_enc_from_encoding(fptr->enc2);
else
- str = rb_str_transcode(str, rb_enc_from_encoding(fptr->enc));
+ common_encoding = rb_enc_from_encoding(fptr->enc);
+ }
+
+ if (!NIL_P(common_encoding)) {
+ int ecflags = 0;
+ if (fptr->mode & FMODE_INVALID_MASK)
+ ecflags |= (fptr->mode / (FMODE_INVALID_MASK/ECONV_INVALID_MASK)) & ECONV_INVALID_MASK;
+ if (fptr->mode & FMODE_UNDEF_MASK)
+ ecflags |= (fptr->mode / (FMODE_UNDEF_MASK/ECONV_UNDEF_MASK)) & ECONV_UNDEF_MASK;
+ str = rb_str_transcode(str, common_encoding, ecflags);
+ }
+
+ if (fptr->writeconv) {
+ str = rb_econv_str_convert(fptr->writeconv, str, ECONV_PARTIAL_INPUT);
}
}
@@ -4622,7 +4641,7 @@ rb_scan_open_args(int argc, VALUE *argv,
static VALUE fs_enc;
if (!fs_enc)
fs_enc = rb_enc_from_encoding(fs_encoding);
- fname = rb_str_transcode(fname, fs_enc);
+ fname = rb_str_transcode(fname, fs_enc, 0);
}
}
#endif
diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb
index 94be8b311f..93b6de0341 100644
--- a/test/ruby/test_io_m17n.rb
+++ b/test/ruby/test_io_m17n.rb
@@ -1202,5 +1202,68 @@ EOT
}
}
end
+
+ def test_invalid_w
+ with_tmpdir {
+ invalid_utf8 = "a\x80b".force_encoding("utf-8")
+ open("t.txt", "w:euc-jp", :invalid => :replace) {|f|
+ assert_nothing_raised { f.write invalid_utf8 }
+ }
+ assert_equal("a?b", File.read("t.txt"))
+
+ open("t.txt", "w:euc-jp", :invalid => :ignore) {|f|
+ assert_nothing_raised { f.write invalid_utf8 }
+ }
+ assert_equal("ab", File.read("t.txt"))
+
+ open("t.txt", "w:euc-jp", :undef => :replace) {|f|
+ assert_raise(Encoding::InvalidByteSequence) { f.write invalid_utf8 }
+ }
+ open("t.txt", "w:euc-jp", :undef => :ignore) {|f|
+ assert_raise(Encoding::InvalidByteSequence) { f.write invalid_utf8 }
+ }
+ }
+ end
+
+ def test_undef_w_stateless
+ with_tmpdir {
+ generate_file("t.txt", "a\uFFFDb")
+ open("t.txt", "w:euc-jp:utf-8", :undef => :replace) {|f|
+ assert_nothing_raised { f.write "a\uFFFDb" }
+ }
+ assert_equal("a?b", File.read("t.txt"))
+ open("t.txt", "w:euc-jp:utf-8", :undef => :ignore) {|f|
+ assert_nothing_raised { f.write "a\uFFFDb" }
+ }
+ assert_equal("ab", File.read("t.txt"))
+ open("t.txt", "w:euc-jp:utf-8", :invalid => :replace) {|f|
+ assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" }
+ }
+ open("t.txt", "w:euc-jp:utf-8", :invalid => :ignore) {|f|
+ assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" }
+ }
+ }
+ end
+
+ def test_undef_w_stateful
+ with_tmpdir {
+ generate_file("t.txt", "a\uFFFDb")
+ open("t.txt", "w:iso-2022-jp:utf-8", :undef => :replace) {|f|
+ assert_nothing_raised { f.write "a\uFFFDb" }
+ }
+ assert_equal("a?b", File.read("t.txt"))
+ open("t.txt", "w:iso-2022-jp:utf-8", :undef => :ignore) {|f|
+ assert_nothing_raised { f.write "a\uFFFDb" }
+ }
+ assert_equal("ab", File.read("t.txt"))
+ open("t.txt", "w:iso-2022-jp:utf-8", :invalid => :replace) {|f|
+ assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" }
+ }
+ open("t.txt", "w:iso-2022-jp:utf-8", :invalid => :ignore) {|f|
+ assert_raise(Encoding::ConversionUndefined) { f.write "a\uFFFDb" }
+ }
+ }
+ end
+
end
diff --git a/transcode.c b/transcode.c
index c087716b43..9c4b9644e3 100644
--- a/transcode.c
+++ b/transcode.c
@@ -1673,58 +1673,49 @@ str_transcoding_resize(VALUE destination, int len, int new_len)
}
static int
-str_transcode(int argc, VALUE *argv, VALUE *self)
+econv_opts(VALUE opt)
+{
+ VALUE v;
+ int options = 0;
+ v = rb_hash_aref(opt, sym_invalid);
+ if (NIL_P(v)) {
+ }
+ else if (v==sym_ignore) {
+ options |= ECONV_INVALID_IGNORE;
+ }
+ else if (v==sym_replace) {
+ options |= ECONV_INVALID_REPLACE;
+ v = rb_hash_aref(opt, sym_replace);
+ }
+ else {
+ rb_raise(rb_eArgError, "unknown value for invalid character option");
+ }
+ v = rb_hash_aref(opt, sym_undef);
+ if (NIL_P(v)) {
+ }
+ else if (v==sym_ignore) {
+ options |= ECONV_UNDEF_IGNORE;
+ }
+ else if (v==sym_replace) {
+ options |= ECONV_UNDEF_REPLACE;
+ }
+ else {
+ rb_raise(rb_eArgError, "unknown value for undefined character option");
+ }
+ return options;
+}
+
+static int
+str_transcode_enc_args(VALUE str, VALUE arg1, VALUE arg2,
+ const char **sname, rb_encoding **senc,
+ const char **dname, rb_encoding **denc)
{
- VALUE dest;
- VALUE str = *self;
- long blen, slen;
- unsigned char *buf, *bp, *sp;
- const unsigned char *fromp;
rb_encoding *from_enc, *to_enc;
const char *from_e, *to_e;
int from_encidx, to_encidx;
VALUE from_encval, to_encval;
- VALUE opt;
- int options = 0;
- if (0 < argc)
- opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
- else
- opt = Qnil;
- if (!NIL_P(opt)) {
- VALUE v;
-
- argc--;
- v = rb_hash_aref(opt, sym_invalid);
- if (NIL_P(v)) {
- }
- else if (v==sym_ignore) {
- options |= ECONV_INVALID_IGNORE;
- }
- else if (v==sym_replace) {
- options |= ECONV_INVALID_REPLACE;
- v = rb_hash_aref(opt, sym_replace);
- }
- else {
- rb_raise(rb_eArgError, "unknown value for invalid character option");
- }
- v = rb_hash_aref(opt, sym_undef);
- if (NIL_P(v)) {
- }
- else if (v==sym_ignore) {
- options |= ECONV_UNDEF_IGNORE;
- }
- else if (v==sym_replace) {
- options |= ECONV_UNDEF_REPLACE;
- }
- else {
- rb_raise(rb_eArgError, "unknown value for undefined character option");
- }
- }
- if (argc < 1 || argc > 2) {
- rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
- }
- if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) {
+ if ((to_encidx = rb_to_encoding_index(to_encval = arg1)) < 0) {
to_enc = 0;
to_encidx = 0;
to_e = StringValueCStr(to_encval);
@@ -1733,12 +1724,12 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
to_enc = rb_enc_from_index(to_encidx);
to_e = rb_enc_name(to_enc);
}
- if (argc==1) {
+ if (NIL_P(arg2)) {
from_encidx = rb_enc_get_index(str);
from_enc = rb_enc_from_index(from_encidx);
from_e = rb_enc_name(from_enc);
}
- else if ((from_encidx = rb_to_encoding_index(from_encval = argv[1])) < 0) {
+ else if ((from_encidx = rb_to_encoding_index(from_encval = arg2)) < 0) {
from_enc = 0;
from_e = StringValueCStr(from_encval);
}
@@ -1747,6 +1738,31 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
from_e = rb_enc_name(from_enc);
}
+ *sname = from_e;
+ *senc = from_enc;
+ *dname = to_e;
+ *denc = to_enc;
+ return to_encidx;
+}
+
+static int
+str_transcode0(int argc, VALUE *argv, VALUE *self, int options)
+{
+ VALUE dest;
+ VALUE str = *self;
+ long blen, slen;
+ unsigned char *buf, *bp, *sp;
+ const unsigned char *fromp;
+ rb_encoding *from_enc, *to_enc;
+ const char *from_e, *to_e;
+ int to_encidx;
+
+ if (argc < 1 || argc > 2) {
+ rb_raise(rb_eArgError, "wrong number of arguments (%d for 1..2)", argc);
+ }
+
+ to_encidx = str_transcode_enc_args(str, argv[0], argc==1 ? Qnil : argv[1], &from_e, &from_enc, &to_e, &to_enc);
+
if (from_enc && from_enc == to_enc) {
return -1;
}
@@ -1782,6 +1798,22 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
return to_encidx;
}
+static int
+str_transcode(int argc, VALUE *argv, VALUE *self)
+{
+ VALUE opt;
+ int options = 0;
+
+ if (0 < argc) {
+ opt = rb_check_convert_type(argv[argc-1], T_HASH, "Hash", "to_hash");
+ if (!NIL_P(opt)) {
+ argc--;
+ options = econv_opts(opt);
+ }
+ }
+ return str_transcode0(argc, argv, self, options);
+}
+
static inline VALUE
str_encode_associate(VALUE str, int encidx)
{
@@ -1850,9 +1882,16 @@ str_encode(int argc, VALUE *argv, VALUE str)
}
VALUE
-rb_str_transcode(VALUE str, VALUE to)
+rb_str_transcode(VALUE str, VALUE to, int flags)
{
- return str_encode(1, &to, str);
+ int argc = 1;
+ VALUE *argv = &to;
+ VALUE newstr = str;
+ int encidx = str_transcode0(argc, argv, &newstr, flags);
+
+ if (encidx < 0) return rb_str_dup(str);
+ RBASIC(newstr)->klass = rb_obj_class(str);
+ return str_encode_associate(newstr, encidx);
}
static void
@@ -2305,7 +2344,7 @@ econv_primitive_insert_output(VALUE self, VALUE string)
StringValue(string);
insert_enc = rb_econv_encoding_to_insert_output(ec);
- string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)));
+ string = rb_str_transcode(string, rb_enc_from_encoding(rb_enc_find(insert_enc)), 0);
ret = rb_econv_insert_output(ec, (const unsigned char *)RSTRING_PTR(string), RSTRING_LEN(string), insert_enc);
if (ret == -1)