From 391abc543cea118a9cd7d6310acadbfa352668ef Mon Sep 17 00:00:00 2001 From: Nobuyoshi Nakada Date: Sat, 26 Jun 2021 16:05:15 +0900 Subject: Scan the coderange in the given encoding --- ext/-test-/string/enc_str_buf_cat.c | 14 ++++++++++++++ string.c | 32 ++++++++++++++++++++++--------- test/-ext-/string/test_enc_str_buf_cat.rb | 9 +++++++++ 3 files changed, 46 insertions(+), 9 deletions(-) diff --git a/ext/-test-/string/enc_str_buf_cat.c b/ext/-test-/string/enc_str_buf_cat.c index 9ac4a298be..4c1b262e1e 100644 --- a/ext/-test-/string/enc_str_buf_cat.c +++ b/ext/-test-/string/enc_str_buf_cat.c @@ -7,8 +7,22 @@ enc_str_buf_cat(VALUE str, VALUE str2) return rb_enc_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2), rb_enc_get(str2)); } +static VALUE +str_conv_enc_opts(VALUE str, VALUE from, VALUE to, VALUE ecflags, VALUE ecopts) +{ /* Binding exercised by test_str_conv_enc: a nil from/to becomes NULL, anything else goes through rb_to_encoding(); a non-nil ecopts must be a Hash and is frozen before being forwarded to rb_str_conv_enc_opts(). */ + rb_encoding *from_enc = NIL_P(from) ? NULL : rb_to_encoding(from); + rb_encoding *to_enc = NIL_P(to) ? NULL : rb_to_encoding(to); + int flags = NUM2INT(ecflags); + if (!NIL_P(ecopts)) { + Check_Type(ecopts, T_HASH); + OBJ_FREEZE(ecopts); + } + return rb_str_conv_enc_opts(str, from_enc, to_enc, flags, ecopts); +} + void Init_string_enc_str_buf_cat(VALUE klass) { rb_define_method(klass, "enc_str_buf_cat", enc_str_buf_cat, 1); + rb_define_method(klass, "str_conv_enc_opts", str_conv_enc_opts, 4); /* 4 = from, to, ecflags, ecopts; the receiver is passed as str */ } diff --git a/string.c b/string.c index 0bb015f38b..c183f2bd01 100644 --- a/string.c +++ b/string.c @@ -697,6 +697,18 @@ rb_enc_cr_str_exact_copy(VALUE dest, VALUE src) ENC_CODERANGE_SET(dest, ENC_CODERANGE(src)); } +static int +enc_coderange_scan(VALUE str, rb_encoding *enc, int encidx) +{ /* Returns the coderange of the bytes of str scanned as enc; nothing is stored on str here. A dummy encoding with mbminlen > 1 whose actual encoding (per get_actual_encoding) has mbminlen 1 is reported as ENC_CODERANGE_BROKEN without scanning. Note that enc is reassigned inside the condition. */ + if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && + rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { + return ENC_CODERANGE_BROKEN; + } + else { + return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc); + } +} + int rb_enc_str_coderange(VALUE str) { @@ -705,14 +717,7 @@ rb_enc_str_coderange(VALUE str) if (cr == 
ENC_CODERANGE_UNKNOWN) { int encidx = ENCODING_GET(str); rb_encoding *enc = rb_enc_from_index(encidx); - if (rb_enc_mbminlen(enc) > 1 && rb_enc_dummy_p(enc) && - rb_enc_mbminlen(enc = get_actual_encoding(encidx, str)) == 1) { - cr = ENC_CODERANGE_BROKEN; - } - else { - cr = coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), - enc); - } + cr = enc_coderange_scan(str, enc, encidx); /* same logic as the removed lines, now shared with is_enc_ascii_string() */ ENC_CODERANGE_SET(str, cr); } return cr; @@ -954,6 +959,15 @@ static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts); +static inline bool +is_enc_ascii_string(VALUE str, rb_encoding *enc) +{ /* Is str 7-bit when read as enc? If enc has the same index as the encoding of str, defer to is_ascii_string(str); otherwise scan the bytes as enc and compare with ENC_CODERANGE_7BIT, because 7-bit-ness under one encoding need not hold under another (test_str_conv_enc covers UTF-16LE bytes tagged as UTF-8). */ + int encidx = rb_enc_to_index(enc); + if (rb_enc_get_index(str) == encidx) + return is_ascii_string(str); + return enc_coderange_scan(str, enc, encidx) == ENC_CODERANGE_7BIT; +} + VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts) { @@ -964,7 +978,7 @@ rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, if (!to) return str; if (!from) from = rb_enc_get(str); if (from == to) return str; - if ((rb_enc_asciicompat(to) && is_ascii_string(str)) || + if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) || /* ASCII-ness is judged in from, which the caller may set to something other than the encoding of str */ to == rb_ascii8bit_encoding()) { if (STR_ENC_GET(str) != to) { str = rb_str_dup(str); diff --git a/test/-ext-/string/test_enc_str_buf_cat.rb b/test/-ext-/string/test_enc_str_buf_cat.rb index 72f903903c..b9a63ec2de 100644 --- a/test/-ext-/string/test_enc_str_buf_cat.rb +++ b/test/-ext-/string/test_enc_str_buf_cat.rb @@ -13,4 +13,13 @@ class Test_StringEncStrBufCat < Test::Unit::TestCase assert_equal(:unknown, Bug::String.new(cr_unknown_str).coderange, "an assertion for following tests") assert_equal(:valid, Bug::String.new(a8_str).enc_str_buf_cat(cr_unknown_str).coderange, Bug6509) end + + def test_str_conv_enc + str = Bug::String.new("aaa".encode("US-ASCII")) + assert_same(str, str.str_conv_enc_opts("UTF-8", "US-ASCII", 0, nil)) + 
+ str = Bug::String.new("aaa".encode("UTF-16LE").force_encoding("UTF-8")) + assert_predicate(str, :ascii_only?) # cache coderange + assert_equal("aaa", str.str_conv_enc_opts("UTF-16LE", "UTF-8", 0, nil)) + end end -- cgit v1.2.3