* string.c (rb_str_inspect): inspect as a dummy encoding string

when a UTF-16/32 (not BE/LE) string does not have a BOM. Unicode and some RFCs say that if a string labeled as UTF-16/32 doesn't have a BOM, it should be considered big endian. But many Windows programs generate little endian UTF-16 strings without a BOM. So String#inspect treats a string labeled UTF-16/32 without a BOM as a dummy encoding string. Patched by Martin Duerst. [ruby-core:33461] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29984 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2010-11-30 16:47:24 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2010-11-30 16:47:24 +0000
commit: 05d7d889ea4034ed2ee6fec8e9ddd6256b72065b (patch)
tree: e6be44b5a5fbda1ccbd8d95295146055fc00059a /string.c
parent: 87703540b03e31d3d34a0645b2e03011471f4167 (diff)
1 files changed, 14 insertions, 4 deletions
diff --git a/string.c b/string.c
index 385b05f5b1..d55f943a5b 100644
--- a/string.c
+++ b/string.c
@@ -4214,10 +4214,22 @@ rb_str_inspect(VALUE str)
     p = RSTRING_PTR(str); pend = RSTRING_END(str);
     prev = p;
     if (enc == utf16) {
-	enc = *p == (char)0xFF ? rb_enc_find("UTF-16LE") : rb_enc_find("UTF-16BE");
+	const unsigned char *q = (const unsigned char *)p;
+	if (q[0] == 0xFE && q[1] == 0xFF)
+	    enc = rb_enc_find("UTF-16BE");
+	else if (q[0] == 0xFF && q[1] == 0xFD)
+	    enc = rb_enc_find("UTF-16LE");
+	else
+	    unicode_p = 0;
     }
     else if (enc == utf32) {
-	enc = *p == (char)0xFF ? rb_enc_find("UTF-32LE") : rb_enc_find("UTF-32BE");
+	const unsigned char *q = (const unsigned char *)p;
+	if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF)
+	    enc = rb_enc_find("UTF-32BE");
+	else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF)
+	    enc = rb_enc_find("UTF-32LE");
+	else
+	    unicode_p = 0;
     }
     while (p < pend) {
 	unsigned int c, cc;
@@ -6004,7 +6016,6 @@ static VALUE
 rb_str_each_codepoint(VALUE str)
 {
     VALUE orig = str;
-    long len;
     int n;
     unsigned int c;
     const char *ptr, *end;
@@ -6014,7 +6025,6 @@ rb_str_each_codepoint(VALUE str)
     RETURN_ENUMERATOR(str, 0, 0);
     str = rb_str_new4(str);
     ptr = RSTRING_PTR(str);
-    len = RSTRING_LEN(str);
     end = RSTRING_END(str);
     enc = STR_ENC_GET(str);
     while (ptr < end) {
author	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2010-11-30 16:47:24 +0000
committer	naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2010-11-30 16:47:24 +0000
commit	05d7d889ea4034ed2ee6fec8e9ddd6256b72065b (patch)
tree	e6be44b5a5fbda1ccbd8d95295146055fc00059a /string.c
parent	87703540b03e31d3d34a0645b2e03011471f4167 (diff)