merge revision(s) 391abc543cea118a9cd7d6310acadbfa352668ef,e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #12052]

Scan the coderange in the given encoding --- ext/-test-/string/enc_str_buf_cat.c | 14 ++++++++++++++ string.c | 32 ++++++++++++++++++++++--------- test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++ 3 files changed, 46 insertions(+), 9 deletions(-) Work around issue transcoding issue with non-ASCII compatible encodings and xml escaping When using a non-ASCII compatible source and destination encoding and xml escaping (the :xml option to String#encode), the resulting string was broken, as it used the correct non-ASCII compatible encoding, but contained data that was ASCII-compatible instead of compatible with the string's encoding. Work around this issue by detecting the case where both the source and destination encoding are non-ASCII compatible, and transcoding the source string from the non-ASCII compatible encoding to UTF-8. The xml escaping code will correctly handle the UTF-8 source string and the return the correctly encoded and escaped value. Fixes [Bug #12052] Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org> --- test/ruby/test_transcode.rb | 19 +++++++++++++++++++ transcode.c | 6 ++++++ 2 files changed, 25 insertions(+) =?UTF-8?q?-=20add=20regression=20tests=20for=20U+6E7F=20(?= =?UTF-8?q?=E6=B9=BF)=20in=20ISO-2022-JP?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ISO-2022-JP, the bytes use to code are the same as those for "<>". This adds regression tests to make sure that these bytes, when representing 湿, are NOT　escaped with encode("ISO-2022-JP, xml: :text) or similar. These are additional regression tests for #12052. --- test/ruby/test_transcode.rb | 3 +++ 1 file changed, 3 insertions(+)
author: nagachika <nagachika@ruby-lang.org> 2021-07-18 11:19:13 +0900
committer: nagachika <nagachika@ruby-lang.org> 2021-07-18 11:19:13 +0900
commit: b93a2d9d2cac5d3efe72537debedb089d447d33a (patch)
tree: 232278aa0df215b592e05053d1081413a46590f5 /string.c
parent: ca6ebde821991cb558003c3a1d680eaa8df3169d (diff)
1 files changed, 23 insertions, 9 deletions
diff --git a/string.c b/string.c
index c2919abeee..ed101187ab 100644
--- a/string.c
+++ b/string.c
@@ -698,6 +698,18 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
     ENC_CODERANGE_SET(dest, ENC_CODERANGE(src));
 }
 
+static int
+enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx)
+{
+    if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
+	rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
+	return ENC_CODERANGE_BROKEN;
+    }
+    else {
+	return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
+    }
+}
+
 int
 rb_enc_str_coderange(VALUE str)
 {
@@ -706,14 +718,7 @@ rb_enc_str_coderange(VALUE str)
     if (cr == ENC_CODERANGE_UNKNOWN) {
 	int encidx = ENCODING_GET(str);
 	rb_encoding *enc = rb_enc_from_index(encidx);
-	if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) &&
-            rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) {
-	    cr = ENC_CODERANGE_BROKEN;
-	}
-	else {
-	    cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str),
-                                enc);
-	}
+	cr = enc_coderange_scan(str, enc, encidx);
         ENC_CODERANGE_SET(str, cr);
     }
     return cr;
@@ -955,6 +960,15 @@ static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long
 				   rb_encoding *from, rb_encoding *to,
 				   int ecflags, VALUE ecopts);
 
+static inline bool
+is_enc_ascii_string(VALUE str, rb_encoding *enc)
+{
+    int encidx = rb_enc_to_index(enc);
+    if (rb_enc_get_index(str) == encidx)
+	return is_ascii_string(str);
+    return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT;
+}
+
 VALUE
 rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
 {
@@ -965,7 +979,7 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags,
     if (!to) return str;
     if (!from) from = rb_enc_get(str);
     if (from == to) return str;
-    if ((rb_enc_asciicompat(to) && is_ascii_string(str)) ||
+    if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
 	to == rb_ascii8bit_encoding()) {
 	if (STR_ENC_GET(str) != to) {
 	    str = rb_str_dup(str);
author	nagachika <nagachika@ruby-lang.org>	2021-07-18 11:19:13 +0900
committer	nagachika <nagachika@ruby-lang.org>	2021-07-18 11:19:13 +0900
commit	b93a2d9d2cac5d3efe72537debedb089d447d33a (patch)
tree	232278aa0df215b592e05053d1081413a46590f5 /string.c
parent	ca6ebde821991cb558003c3a1d680eaa8df3169d (diff)