From 05d7d889ea4034ed2ee6fec8e9ddd6256b72065b Mon Sep 17 00:00:00 2001 From: naruse Date: Tue, 30 Nov 2010 16:47:24 +0000 Subject: * string.c (rb_str_inspect): inspect as a dummy encoding string when a UTF-16/32 (not BE/LE) string does not have a BOM. Unicode and some RFCs say that when a string labeled as UTF-16/32 doesn't have a BOM, it should be considered big endian. But many Windows programs generate little endian UTF-16 strings without a BOM. So String#inspect treats a string labeled UTF-16/32 without a BOM as a dummy encoding string. patched by Martin Duerst. [ruby-core:33461] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29984 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- string.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'string.c') diff --git a/string.c b/string.c index 385b05f5b1..d55f943a5b 100644 --- a/string.c +++ b/string.c @@ -4214,10 +4214,22 @@ rb_str_inspect(VALUE str) p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; if (enc == utf16) { - enc = *p == (char)0xFF ? rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE"); + const unsigned char *q = (const unsigned char *)p; + if (q[0] == 0xFE && q[1] == 0xFF) + enc = rb_enc_find("UTF-16BE"); + else if (q[0] == 0xFF && q[1] == 0xFD) + enc = rb_enc_find("UTF-16LE"); + else + unicode_p = 0; } else if (enc == utf32) { - enc = *p == (char)0xFF ? 
rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE"); + const unsigned char *q = (const unsigned char *)p; + if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) + enc = rb_enc_find("UTF-32BE"); + else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) + enc = rb_enc_find("UTF-32LE"); + else + unicode_p = 0; } while (p < pend) { unsigned int c, cc; @@ -6004,7 +6016,6 @@ static VALUE rb_str_each_codepoint(VALUE str) { VALUE orig = str; - long len; int n; unsigned int c; const char *ptr, *end; @@ -6014,7 +6025,6 @@ rb_str_each_codepoint(VALUE str) RETURN_ENUMERATOR(str, 0, 0); str = rb_str_new4(str); ptr = RSTRING_PTR(str); - len = RSTRING_LEN(str); end = RSTRING_END(str); enc = STR_ENC_GET(str); while (ptr < end) { -- cgit v1.2.3