summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 6
-rw-r--r-- include/ruby/encoding.h 6
-rw-r--r-- test/ruby/test_io_m17n.rb 49
-rw-r--r-- transcode.c 47
4 files changed, 92 insertions, 16 deletions
diff --git a/ChangeLog b/ChangeLog
index 438d502d40..df348e3934 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Tue Aug 26 21:53:56 2008 Tanaka Akira <akr@fsij.org>
+
+ * transcode.c (rb_econv_open): disable newline conversion for ASCII
+ incompatible encodings.
+ (str_transcode0): no need to disable newline conversion here.
+
Tue Aug 26 21:44:39 2008 Tanaka Akira <akr@fsij.org>
* transcode.c (rb_econv_binmode): binmode is effective only once.
diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h
index 55b5bde354..fa3ebeff1d 100644
--- a/include/ruby/encoding.h
+++ b/include/ruby/encoding.h
@@ -297,12 +297,18 @@ void rb_econv_binmode(rb_econv_t *ec);
#define ECONV_UNDEF_IGNORE 0x0010
#define ECONV_UNDEF_REPLACE 0x0020
+/* effective only if output is ascii compatible */
#define ECONV_UNIVERSAL_NEWLINE_DECODER 0x0100
+
+/* effective only if input is ascii compatible */
#define ECONV_CRLF_NEWLINE_ENCODER 0x0200
#define ECONV_CR_NEWLINE_ENCODER 0x0400
+/* end of flags for rb_econv_open */
+
/* flags for rb_econv_convert */
#define ECONV_PARTIAL_INPUT 0x10000
#define ECONV_OUTPUT_FOLLOWED_BY_INPUT 0x20000
+/* end of flags for rb_econv_convert */
#endif /* RUBY_ENCODING_H */
diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb
index 47d04a18d5..2caa2cb2b8 100644
--- a/test/ruby/test_io_m17n.rb
+++ b/test/ruby/test_io_m17n.rb
@@ -1154,8 +1154,18 @@ EOT
}
end
+ SYSTEM_NEWLINE = []
def system_newline
- File::BINARY == 0 ? "\n" : "\r\n"
+ return SYSTEM_NEWLINE.first if !SYSTEM_NEWLINE.empty?
+ with_tmpdir {
+ open("newline", "wt") {|f|
+ f.print "\n"
+ }
+ open("newline", "rb") {|f|
+ SYSTEM_NEWLINE << f.read
+ }
+ }
+ SYSTEM_NEWLINE.first
end
def test_textmode_encode_newline
@@ -1170,6 +1180,41 @@ EOT
}
end
+ def test_textmode_encode_newline_enc
+ with_tmpdir {
+ open("t.txt", "wt:euc-jp") {|f|
+ f.puts "abc\u3042"
+ f.puts "def\u3044"
+ }
+ content = File.read("t.txt", :mode=>"rb:ascii-8bit")
+ nl = system_newline
+ assert_equal("abc\xA4\xA2#{nl}def\xA4\xA4#{nl}", content)
+ }
+ end
+
+ def test_textmode_read_ascii_incompat_internal
+ with_tmpdir {
+ generate_file("t.utf8.crlf", "a\r\nb\r\n")
+ open("t.utf8.crlf", "rt:utf-8:utf-16be") {|f|
+ content = f.read
+ # textmode has no effect for an ASCII-incompatible internal encoding.
+ assert_equal("\0a\0\r\0\n\0b\0\r\0\n".force_encoding("UTF-16BE"),
+ content)
+ }
+ }
+ end
+
+ def test_textmode_write_ascii_incompat_internal
+ with_tmpdir {
+ open("t.utf8.lf", "wt:utf-8:utf-16be") {|f|
+ f.print "\0a\0\n\0b\0\n".force_encoding("UTF-16BE")
+ }
+ content = File.read("t.utf8.lf", :mode=>"rb:ascii-8bit")
+ # textmode has no effect for an ASCII-incompatible internal encoding.
+ assert_equal("a\nb\n", content)
+ }
+ end
+
def test_binary
with_tmpdir {
src = "a\nb\rc\r\nd\n"
@@ -1180,7 +1225,7 @@ EOT
open("t.txt", "r", :binmode=>true) {|f|
assert_equal(src, f.read)
}
- if File::BINARY == 0
+ if system_newline == "\n"
open("t.txt", "r") {|f|
assert_equal(src, f.read)
}
diff --git a/transcode.c b/transcode.c
index 108a44b880..a35db7c0cb 100644
--- a/transcode.c
+++ b/transcode.c
@@ -748,6 +748,26 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
int num_additional;
static rb_econv_t *ec;
int flags = opts ? opts->flags : 0;
+ int universal_newline_decoder_added = 0;
+
+ rb_encoding *senc, *denc;
+ int sidx, didx;
+
+ senc = NULL;
+ if (*from) {
+ sidx = rb_enc_find_index(from);
+ if (0 <= sidx) {
+ senc = rb_enc_from_index(sidx);
+ }
+ }
+
+ denc = NULL;
+ if (*to) {
+ didx = rb_enc_find_index(to);
+ if (0 <= didx) {
+ denc = rb_enc_from_index(didx);
+ }
+ }
if (*from == '\0' && *to == '\0') {
num_trans = 0;
@@ -763,7 +783,8 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
}
num_additional = 0;
- if (flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER)) {
+ if ((!*from || (senc && rb_enc_asciicompat(senc))) &&
+ (flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER))) {
const char *name = (flags & ECONV_CRLF_NEWLINE_ENCODER) ? "crlf_newline" : "cr_newline";
transcoder_entry_t *e = get_transcoder_entry("", name);
if (flags & ECONV_CRLF_NEWLINE_ENCODER)
@@ -779,8 +800,12 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
num_trans++;
num_additional++;
}
+ else {
+ flags &= ~(ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER);
+ }
- if (flags & ECONV_UNIVERSAL_NEWLINE_DECODER) {
+ if ((!*to || (denc && rb_enc_asciicompat(denc))) &&
+ (flags & ECONV_UNIVERSAL_NEWLINE_DECODER)) {
transcoder_entry_t *e = get_transcoder_entry("universal_newline", "");
if (!e) {
xfree(entries);
@@ -788,6 +813,10 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
}
entries[num_trans++] = e;
num_additional++;
+ universal_newline_decoder_added = 1;
+ }
+ else {
+ flags &= ~ECONV_UNIVERSAL_NEWLINE_DECODER;
}
ec = rb_econv_open_by_transcoder_entries(num_trans, entries);
@@ -799,6 +828,7 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
ec->opts.flags = 0;
else
ec->opts = *opts;
+ ec->opts.flags = flags;
ec->source_encoding_name = from;
ec->destination_encoding_name = to;
@@ -806,7 +836,7 @@ rb_econv_open(const char *from, const char *to, rb_econv_option_t *opts)
ec->last_tc = NULL;
ec->last_trans_index = -1;
}
- else if (flags & ECONV_UNIVERSAL_NEWLINE_DECODER) {
+ else if (universal_newline_decoder_added) {
ec->last_tc = ec->elems[ec->num_trans-2].tc;
ec->last_trans_index = ec->num_trans-2;
}
@@ -1886,17 +1916,6 @@ str_transcode0(int argc, VALUE *argv, VALUE *self, rb_econv_option_t *ecopts_arg
else
rb_econv_opts(Qnil, &ecopts);
- /* disable newline conversion for ascii incompatible encoding.
- * xxx: convert newline in ascii-compatible encoding?
- * ex. UTF-16BE -> UTF-8 -> newline conversion -> UTF-8 -> UTF-16BE.
- */
- if (!from_enc || !rb_enc_asciicompat(from_enc)) {
- ecopts.flags &= ~(ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER);
- }
- if (!to_enc || !rb_enc_asciicompat(to_enc)) {
- ecopts.flags &= ~ECONV_UNIVERSAL_NEWLINE_DECODER;
- }
-
if ((ecopts.flags & (ECONV_UNIVERSAL_NEWLINE_DECODER|
ECONV_CRLF_NEWLINE_ENCODER|
ECONV_CR_NEWLINE_ENCODER)) == 0) {