From b93a2d9d2cac5d3efe72537debedb089d447d33a Mon Sep 17 00:00:00 2001 From: nagachika Date: Sun, 18 Jul 2021 11:19:13 +0900 Subject: merge revision(s) 391abc543cea118a9cd7d6310acadbfa352668ef,e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #12052] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scan the coderange in the given encoding --- ext/-test-/string/enc_str_buf_cat.c | 14 ++++++++++++++ string.c | 32 ++++++++++++++++++++++--------- test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++ 3 files changed, 46 insertions(+), 9 deletions(-) Work around transcoding issue with non-ASCII compatible encodings and xml escaping When using a non-ASCII compatible source and destination encoding and xml escaping (the :xml option to String#encode), the resulting string was broken, as it used the correct non-ASCII compatible encoding, but contained data that was ASCII-compatible instead of compatible with the string's encoding. Work around this issue by detecting the case where both the source and destination encoding are non-ASCII compatible, and transcoding the source string from the non-ASCII compatible encoding to UTF-8. The xml escaping code will correctly handle the UTF-8 source string and then return the correctly encoded and escaped value. Fixes [Bug #12052] Co-authored-by: Nobuyoshi Nakada --- test/ruby/test_transcode.rb | 19 +++++++++++++++++++ transcode.c | 6 ++++++ 2 files changed, 25 insertions(+) - add regression tests for U+6E7F (湿) in ISO-2022-JP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In ISO-2022-JP, the bytes used to encode 湿 are the same as those for "<>". This adds regression tests to make sure that these bytes, when representing 湿, are NOT escaped with encode("ISO-2022-JP", xml: :text) or similar. These are additional regression tests for #12052. 
--- test/ruby/test_transcode.rb | 3 +++ 1 file changed, 3 insertions(+) --- test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'test/-ext-') diff --git a/test/-ext-/string/test_enc_str_buf_cat.rb b/test/-ext-/string/test_enc_str_buf_cat.rb index 72f903903c..b9a63ec2de 100644 --- a/test/-ext-/string/test_enc_str_buf_cat.rb +++ b/test/-ext-/string/test_enc_str_buf_cat.rb @@ -13,4 +13,13 @@ class Test_StringEncStrBufCat < Test::Unit::TestCase assert_equal(:unknown, Bug::String.new(cr_unknown_str).coderange, "an assertion for following tests") assert_equal(:valid, Bug::String.new(a8_str).enc_str_buf_cat(cr_unknown_str).coderange, Bug6509) end + + def test_str_conv_enc + str = Bug::String.new("aaa".encode("US-ASCII")) + assert_same(str, str.str_conv_enc_opts("UTF-8", "US-ASCII", 0, nil)) + + str = Bug::String.new("aaa".encode("UTF-16LE").force_encoding("UTF-8")) + assert_predicate(str, :ascii_only?) # cache coderange + assert_equal("aaa", str.str_conv_enc_opts("UTF-16LE", "UTF-8", 0, nil)) + end end -- cgit v1.2.3