From ea1b1ea1aa98bc9488564ef18aa4032aa1cb5536 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Wed, 8 Nov 2023 14:05:52 +0100 Subject: String#force_encoding doesn't clear coderange if encoding is unchanged Some code out there blindly calls `force_encoding` without checking what the original encoding was, which clears the coderange uselessly. If the String is big, it can be a rather costly mistake. For instance, the `rack-utf8_sanitizer` gem does this on request bodies. --- string.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'string.c') diff --git a/string.c b/string.c index b3004624dd..41641c67e8 100644 --- a/string.c +++ b/string.c @@ -10843,7 +10843,23 @@ static VALUE rb_str_force_encoding(VALUE str, VALUE enc) { str_modifiable(str); - rb_enc_associate(str, rb_to_encoding(enc)); + + rb_encoding *encoding = rb_to_encoding(enc); + int idx = rb_enc_to_index(encoding); + + // If the encoding is unchanged, we do nothing. + if (ENCODING_GET(str) == idx) { + return str; + } + + rb_enc_associate_index(str, idx); + + // If the coderange was 7bit and the new encoding is ASCII-compatible + // we can keep the coderange. + if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) { + return str; + } + ENC_CODERANGE_CLEAR(str); return str; } -- cgit v1.2.3