summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authornaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2010-11-30 16:47:24 +0000
committernaruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2010-11-30 16:47:24 +0000
commit05d7d889ea4034ed2ee6fec8e9ddd6256b72065b (patch)
treee6be44b5a5fbda1ccbd8d95295146055fc00059a
parent87703540b03e31d3d34a0645b2e03011471f4167 (diff)
* string.c (rb_str_inspect): inspect as a dummy encoding string
when a UTF-16/32 (not BE/LE) string does not have a BOM. Unicode and some RFCs say that if a string labeled as UTF-16/32 doesn't have a BOM, it should be considered big endian. But many Windows programs generate little endian UTF-16 strings without a BOM. So String#inspect treats a string labeled UTF-16/32 without a BOM as a dummy encoding string. patched by Martin Duerst. [ruby-core:33461] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29984 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--ChangeLog11
-rw-r--r--string.c18
-rw-r--r--test/ruby/test_m17n.rb13
3 files changed, 38 insertions, 4 deletions
diff --git a/ChangeLog b/ChangeLog
index 2df7ee73a2..440ee3d9e0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+Wed Dec 1 01:29:15 2010 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * string.c (rb_str_inspect): inspect as a dummy encoding string
+ when a UTF-16/32 (not BE/LE) string does not have a BOM.
+ Unicode and some RFCs say that if a string labeled as UTF-16/32
+ doesn't have a BOM, it should be considered big endian.
+ But many Windows programs generate little endian UTF-16
+ strings without a BOM. So String#inspect treats a string
+ labeled UTF-16/32 without a BOM as a dummy encoding string.
+ patched by Martin Duerst. [ruby-core:33461]
+
Tue Nov 30 17:04:10 2010 NARUSE, Yui <naruse@ruby-lang.org>
* addr2line.c (parse_debug_line_cu): ignore DW_LNE_set_discriminator.
diff --git a/string.c b/string.c
index 385b05f5b1..d55f943a5b 100644
--- a/string.c
+++ b/string.c
@@ -4214,10 +4214,22 @@ rb_str_inspect(VALUE str)
p = RSTRING_PTR(str); pend = RSTRING_END(str);
prev = p;
if (enc == utf16) {
- enc = *p == (char)0xFF ? rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE");
+ const unsigned char *q = (const unsigned char *)p;
+ if (q[0] == 0xFE && q[1] == 0xFF)
+ enc = rb_enc_find("UTF-16BE");
+ else if (q[0] == 0xFF && q[1] == 0xFE)
+ enc = rb_enc_find("UTF-16LE");
+ else
+ unicode_p = 0;
}
else if (enc == utf32) {
- enc = *p == (char)0xFF ? rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE");
+ const unsigned char *q = (const unsigned char *)p;
+ if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
+ enc = rb_enc_find("UTF-32BE");
+ else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
+ enc = rb_enc_find("UTF-32LE");
+ else
+ unicode_p = 0;
}
while (p < pend) {
unsigned int c, cc;
@@ -6004,7 +6016,6 @@ static VALUE
rb_str_each_codepoint(VALUE str)
{
VALUE orig = str;
- long len;
int n;
unsigned int c;
const char *ptr, *end;
@@ -6014,7 +6025,6 @@ rb_str_each_codepoint(VALUE str)
RETURN_ENUMERATOR(str, 0, 0);
str = rb_str_new4(str);
ptr = RSTRING_PTR(str);
- len = RSTRING_LEN(str);
end = RSTRING_END(str);
enc = STR_ENC_GET(str);
while (ptr < end) {
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb
index 3dccc76c91..cfe7ed4d3d 100644
--- a/test/ruby/test_m17n.rb
+++ b/test/ruby/test_m17n.rb
@@ -232,6 +232,19 @@ class TestM17N < Test::Unit::TestCase
Encoding.default_external = orig_ext
end
+ def test_utf_16_32_inspect
+ str = "\u3042"
+ %w/UTF-16 UTF-32/.each do |enc|
+ %w/BE LE/.each do |endian|
+ s = str.encode(enc + endian)
+ # When a UTF-16/32 string doesn't have a BOM,
+ # inspect as a dummy encoding string.
+ assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect,
+ s.dup.force_encoding(enc).inspect)
+ end
+ end
+ end
+
def test_str_dump
[
e("\xfe"),