String#force_encoding: don't clear coderange if encoding is unchanged

Some code out there blindly calls `force_encoding` without checking what the original encoding was, which needlessly clears the coderange. If the String is big, this can be a rather costly mistake, because the coderange then has to be recomputed by rescanning the whole string. For instance, the `rack-utf8_sanitizer` gem does this on request bodies.
author: Jean Boussier <byroot@ruby-lang.org> 2023-11-08 14:05:52 +0100
committer: Jean Boussier <jean.boussier@gmail.com> 2023-11-09 12:38:10 +0100
commit: ea1b1ea1aa98bc9488564ef18aa4032aa1cb5536 (patch)
tree: 8e0428d9ddf0cf765bc06538adda3b7a98ccb0be /string.c
parent: 0a7e620a36a74c4fc604f9068fb839658678f96c (diff)
1 files changed, 17 insertions, 1 deletions
diff --git a/string.c b/string.c
index b3004624dd..41641c67e8 100644
--- a/string.c
+++ b/string.c
@@ -10843,7 +10843,23 @@ static VALUE
 rb_str_force_encoding(VALUE str, VALUE enc)
 {
     str_modifiable(str);
-    rb_enc_associate(str, rb_to_encoding(enc));
+
+    rb_encoding *encoding = rb_to_encoding(enc);
+    int idx = rb_enc_to_index(encoding);
+
+    // If the encoding is unchanged, we do nothing.
+    if (ENCODING_GET(str) == idx) {
+        return str;
+    }
+
+    rb_enc_associate_index(str, idx);
+
+    // If the coderange was 7bit and the new encoding is ASCII-compatible
+    // we can keep the coderange.
+    if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
+        return str;
+    }
+
     ENC_CODERANGE_CLEAR(str);
     return str;
 }
author	Jean Boussier <byroot@ruby-lang.org>	2023-11-08 14:05:52 +0100
committer	Jean Boussier <jean.boussier@gmail.com>	2023-11-09 12:38:10 +0100
commit	ea1b1ea1aa98bc9488564ef18aa4032aa1cb5536 (patch)
tree	8e0428d9ddf0cf765bc06538adda3b7a98ccb0be /string.c
parent	0a7e620a36a74c4fc604f9068fb839658678f96c (diff)