diff options
| author | nagachika <nagachika@ruby-lang.org> | 2024-01-18 11:50:31 +0900 |
|---|---|---|
| committer | nagachika <nagachika@ruby-lang.org> | 2024-01-18 11:50:31 +0900 |
| commit | b4f8623441a8be53b643fed826ba44e933cafd7e (patch) | |
| tree | 6cfd3038569980b79c2a35cc30832c6d2d99c9d1 | |
| parent | 0cc0e43745ffc13a596441adccee295274d99a0b (diff) | |
merge revision(s) b3d612804946e841e47d14e09b6839224a79c1a4: [Backport #20150]
Fix memory leak in grapheme clusters
[Bug #20150]
String#grapheme_cluters and String#each_grapheme_cluster leaks memory
because if the string is not UTF-8, then the created regex will not
be freed.
For example:
str = "hello world".encode(Encoding::UTF_32LE)
10.times do
1_000.times do
str.grapheme_clusters
end
puts `ps -o rss= -p #{$$}`
end
Before:
26000
42256
59008
75792
92528
109232
125936
142672
159392
176160
After:
9264
9504
9808
10000
10128
10224
10352
10544
10704
10896
---
string.c | 98 +++++++++++++++++++++++++++++++-----------------
test/ruby/test_string.rb | 11 ++++++
2 files changed, 75 insertions(+), 34 deletions(-)
| -rw-r--r-- | string.c | 98 | ||||
| -rw-r--r-- | test/ruby/test_string.rb | 11 | ||||
| -rw-r--r-- | version.h | 2 |
3 files changed, 76 insertions, 35 deletions
@@ -9297,56 +9297,65 @@ static regex_t * get_reg_grapheme_cluster(rb_encoding *enc) { int encidx = rb_enc_to_index(enc); - regex_t *reg_grapheme_cluster = NULL; - static regex_t *reg_grapheme_cluster_utf8 = NULL; - /* synchronize */ - if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) { - reg_grapheme_cluster = reg_grapheme_cluster_utf8; - } - if (!reg_grapheme_cluster) { - const OnigUChar source_ascii[] = "\\X"; - OnigErrorInfo einfo; - const OnigUChar *source = source_ascii; - size_t source_len = sizeof(source_ascii) - 1; - switch (encidx) { + const OnigUChar source_ascii[] = "\\X"; + const OnigUChar *source = source_ascii; + size_t source_len = sizeof(source_ascii) - 1; + + switch (encidx) { #define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x) #define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8) #define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x) #define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16) #define CASE_UTF(e) \ - case ENCINDEX_UTF_##e: { \ - static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ - source = source_UTF_##e; \ - source_len = sizeof(source_UTF_##e); \ - break; \ - } - CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); + case ENCINDEX_UTF_##e: { \ + static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \ + source = source_UTF_##e; \ + source_len = sizeof(source_UTF_##e); \ + break; \ + } + CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE); #undef CASE_UTF #undef CHARS_16BE #undef CHARS_16LE #undef CHARS_32BE #undef CHARS_32LE - } - int r = onig_new(®_grapheme_cluster, source, source + source_len, - ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); - if (r) { - UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; - onig_error_code_to_str(message, r, &einfo); - rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message); - } - if (encidx == rb_utf8_encindex()) { - reg_grapheme_cluster_utf8 = reg_grapheme_cluster; - } } + + regex_t *reg_grapheme_cluster; + OnigErrorInfo einfo; + int r = onig_new(®_grapheme_cluster, source, source + source_len, + ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo); + if (r) { + UChar message[ONIG_MAX_ERROR_MESSAGE_LEN]; + onig_error_code_to_str(message, r, &einfo); + rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message); + } + return reg_grapheme_cluster; } +static regex_t * +get_cached_reg_grapheme_cluster(rb_encoding *enc) +{ + int encidx = rb_enc_to_index(enc); + static regex_t *reg_grapheme_cluster_utf8 = NULL; + + if (encidx == rb_utf8_encindex()) { + if (!reg_grapheme_cluster_utf8) { + reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc); + } + + return reg_grapheme_cluster_utf8; + } + + return NULL; +} + static VALUE rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) { size_t grapheme_cluster_count = 0; - regex_t *reg_grapheme_cluster = NULL; rb_encoding *enc = get_encoding(str); const char *ptr, *end; @@ -9354,7 +9363,13 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) return rb_str_length(str); } - reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + bool cached_reg_grapheme_cluster = true; + regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc); + if (!reg_grapheme_cluster) { + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + cached_reg_grapheme_cluster = false; + } + ptr = RSTRING_PTR(str); end = RSTRING_END(str); @@ -9367,6 +9382,10 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) ptr += len; } + if (!cached_reg_grapheme_cluster) { + onig_free(reg_grapheme_cluster); + } + return SIZET2NUM(grapheme_cluster_count); } @@ -9374,7 +9393,6 @@ static VALUE rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) { VALUE orig = str; - regex_t *reg_grapheme_cluster = NULL; rb_encoding *enc = get_encoding(str); const char *ptr0, *ptr, *end; @@ -9383,7 +9401,14 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) } if (!ary) str = rb_str_new_frozen(str); - reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + + bool cached_reg_grapheme_cluster = true; + regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc); + if (!reg_grapheme_cluster) { + reg_grapheme_cluster = get_reg_grapheme_cluster(enc); + cached_reg_grapheme_cluster = false; + } + ptr0 = ptr = RSTRING_PTR(str); end = RSTRING_END(str); @@ -9395,6 +9420,11 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len)); ptr += len; } + + if (!cached_reg_grapheme_cluster) { + onig_free(reg_grapheme_cluster); + } + RB_GC_GUARD(str); if (ary) return ary; diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index 7c685407d6..069813e681 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -1074,6 +1074,17 @@ CODE assert_equal("C", res[2]) end + def test_grapheme_clusters_memory_leak + assert_no_memory_leak([], "", "#{<<~"begin;"}\n#{<<~'end;'}", "[Bug #todo]", rss: true) + begin; + str = "hello world".encode(Encoding::UTF_32LE) + + 10_000.times do + str.grapheme_clusters + end + end; + end + def test_each_line verbose, $VERBOSE = $VERBOSE, nil @@ -11,7 +11,7 @@ # define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR #define RUBY_VERSION_TEENY 2 #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR -#define RUBY_PATCHLEVEL 151 +#define RUBY_PATCHLEVEL 152 #include "ruby/version.h" #include "ruby/internal/abi.h" |
