diff options
author | nagachika <nagachika@ruby-lang.org> | 2021-07-18 11:19:13 +0900 |
---|---|---|
committer | nagachika <nagachika@ruby-lang.org> | 2021-07-18 11:19:13 +0900 |
commit | b93a2d9d2cac5d3efe72537debedb089d447d33a (patch) | |
tree | 232278aa0df215b592e05053d1081413a46590f5 /string.c | |
parent | ca6ebde821991cb558003c3a1d680eaa8df3169d (diff) |
merge revision(s) 391abc543cea118a9cd7d6310acadbfa352668ef,e86c1f6fc53433ef5c82ed2b7a4cc9a12c153e4c,f6539202c52a051a4e6946a318a1d9cd29002990: [Backport #12052]
Scan the coderange in the given encoding
---
ext/-test-/string/enc_str_buf_cat.c | 14 ++++++++++++++
string.c | 32 ++++++++++++++++++++++---------
test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++
3 files changed, 46 insertions(+), 9 deletions(-)
Work around transcoding issue with non-ASCII compatible
encodings and xml escaping
When using a non-ASCII compatible source and destination encoding
and xml escaping (the :xml option to String#encode), the resulting
string was broken, as it used the correct non-ASCII compatible
encoding, but contained data that was ASCII-compatible instead of
compatible with the string's encoding.
Work around this issue by detecting the case where both the
source and destination encoding are non-ASCII compatible, and
transcoding the source string from the non-ASCII compatible
encoding to UTF-8. The xml escaping code will correctly handle
the UTF-8 source string and return the correctly encoded
and escaped value.
Fixes [Bug #12052]
Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
---
test/ruby/test_transcode.rb | 19 +++++++++++++++++++
transcode.c | 6 ++++++
2 files changed, 25 insertions(+)
- add regression tests for U+6E7F (湿) in ISO-2022-JP
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
In ISO-2022-JP, the bytes used to encode 湿 are the same as those for "<>".
This adds regression tests to make sure that these bytes, when representing
湿, are NOT escaped with encode("ISO-2022-JP", xml: :text) or similar.
These are additional regression tests for #12052.
---
test/ruby/test_transcode.rb | 3 +++
1 file changed, 3 insertions(+)
Diffstat (limited to 'string.c')
-rw-r--r-- | string.c | 32 |
1 files changed, 23 insertions, 9 deletions
@@ -698,6 +698,18 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); } +static int +enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx) +{ + if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && + rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { + return ENC_CODERANGE_BROKEN; + } + else { + return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); + } +} + int rb_enc_str_coderange(VALUE str) { @@ -706,14 +718,7 @@ rb_enc_str_coderange(VALUE str) if (cr == ENC_CODERANGE_UNKNOWN) { int encidx = ENCODING_GET(str); rb_encoding *enc = rb_enc_from_index(encidx); - if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && - rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { - cr = ENC_CODERANGE_BROKEN; - } - else { - cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), - enc); - } + cr = enc_coderange_scan(str, enc, encidx); ENC_CODERANGE_SET(str, cr); } return cr; @@ -955,6 +960,15 @@ static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts); +static inline bool +is_enc_ascii_string(VALUE str, rb_encoding *enc) +{ + int encidx = rb_enc_to_index(enc); + if (rb_enc_get_index(str) == encidx) + return is_ascii_string(str); + return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT; +} + VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) { @@ -965,7 +979,7 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, if (!to) return str; if (!from) from = rb_enc_get(str); if (from == to) return str; - if ((rb_enc_asciicompat(to) && is_ascii_string(str)) || + if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) || to == rb_ascii8bit_encoding()) { if (STR_ENC_GET(str) != to) { str = rb_str_dup(str); |