diff options
| author | Jean Boussier <byroot@ruby-lang.org> | 2023-11-08 14:05:52 +0100 |
|---|---|---|
| committer | Jean Boussier <jean.boussier@gmail.com> | 2023-11-09 12:38:10 +0100 |
| commit | ea1b1ea1aa98bc9488564ef18aa4032aa1cb5536 (patch) | |
| tree | 8e0428d9ddf0cf765bc06538adda3b7a98ccb0be | |
| parent | 0a7e620a36a74c4fc604f9068fb839658678f96c (diff) | |
String#force_encoding: don't clear coderange if encoding is unchanged
Some code out there blindly calls `force_encoding` without checking
what the original encoding was, which clears the coderange uselessly.
If the String is big, it can be a rather costly mistake.
For instance the `rack-utf8_sanitizer` gem does this on request
bodies.
| -rw-r--r-- | string.c | 18 | ||||
| -rw-r--r-- | test/objspace/test_objspace.rb | 2 | ||||
| -rw-r--r-- | test/ruby/test_process.rb | 1 |
3 files changed, 19 insertions, 2 deletions
@@ -10843,7 +10843,23 @@ static VALUE rb_str_force_encoding(VALUE str, VALUE enc) { str_modifiable(str); - rb_enc_associate(str, rb_to_encoding(enc)); + + rb_encoding *encoding = rb_to_encoding(enc); + int idx = rb_enc_to_index(encoding); + + // If the encoding is unchanged, we do nothing. + if (ENCODING_GET(str) == idx) { + return str; + } + + rb_enc_associate_index(str, idx); + + // If the coderange was 7bit and the new encoding is ASCII-compatible + // we can keep the coderange. + if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) { + return str; + } + ENC_CODERANGE_CLEAR(str); return str; } diff --git a/test/objspace/test_objspace.rb b/test/objspace/test_objspace.rb index bc6799b49f..1f1709fb76 100644 --- a/test/objspace/test_objspace.rb +++ b/test/objspace/test_objspace.rb @@ -585,7 +585,7 @@ class TestObjSpace < Test::Unit::TestCase def test_dump_string_coderange assert_includes ObjectSpace.dump("TEST STRING"), '"coderange":"7bit"' - unknown = "TEST STRING".dup.force_encoding(Encoding::BINARY) + unknown = "TEST STRING".dup.force_encoding(Encoding::UTF_16BE) 2.times do # ensure that dumping the string doesn't mutate it assert_includes ObjectSpace.dump(unknown), '"coderange":"unknown"' end diff --git a/test/ruby/test_process.rb b/test/ruby/test_process.rb index 47228d35e6..0416b20176 100644 --- a/test/ruby/test_process.rb +++ b/test/ruby/test_process.rb @@ -2747,6 +2747,7 @@ EOS require 'objspace' begin; obj = "a" * 12 + obj.force_encoding(Encoding::UTF_16LE) obj.force_encoding(Encoding::BINARY) assert_include(ObjectSpace.dump(obj), '"coderange":"unknown"') Process.warmup |
