diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-03-22 07:58:38 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-03-22 07:58:38 +0000 |
commit | 41b2ef468597a120d52f3f73cda47cd284ab1f99 (patch) | |
tree | a5293314b6ab1afc29fd110f73a0cca833988c1b | |
parent | 6e0f5b8407b625a3039d93f48b56aac5695aa48e (diff) |
fix each_grapheme_cluster's size [Bug #14363]
From: Hugo Peixoto <hugo.peixoto@gmail.com>
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@62892 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | string.c | 52 | ||||
-rw-r--r-- | test/ruby/test_string.rb | 13 |
2 files changed, 61 insertions, 4 deletions
@@ -8356,6 +8356,56 @@ rb_str_codepoints(VALUE str) } static VALUE +rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) +{ + long grapheme_cluster_count = 0; + regex_t *reg_grapheme_cluster = NULL; + static regex_t *reg_grapheme_cluster_utf8 = NULL; + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx); + int unicode_p = rb_enc_unicode_p(enc); + const char *ptr, *end; + + if (!unicode_p || single_byte_optimizable(str)) { + return rb_str_length(str); + } + + /* synchronize */ + if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) { + reg_grapheme_cluster = reg_grapheme_cluster_utf8; + } + if (!reg_grapheme_cluster) { + const OnigUChar source[] = "\\X"; + int r = onig_new(&reg_grapheme_cluster, source, source + sizeof(source) - 1, + ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, NULL); + if (r) { + rb_bug("cannot compile grapheme cluster regexp"); + } + if (encidx == rb_utf8_encindex()) { + reg_grapheme_cluster_utf8 = reg_grapheme_cluster; + } + } + + ptr = RSTRING_PTR(str); + end = RSTRING_END(str); + + while (ptr < end) { + OnigPosition len = onig_match(reg_grapheme_cluster, + (const OnigUChar *)ptr, (const OnigUChar *)end, + (const OnigUChar *)ptr, NULL, 0); + if (len == 0) break; + if (len < 0) { + break; + } + grapheme_cluster_count++; + ptr += len; + } + RB_GC_GUARD(str); + + return LONG2NUM(grapheme_cluster_count); +} + +static VALUE rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) { VALUE orig = str; @@ -8426,7 +8476,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) static VALUE rb_str_each_grapheme_cluster(VALUE str) { - RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size); + RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size); return rb_str_enumerate_grapheme_clusters(str, 0); } diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index f71dfc7fee..fee71791a1 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -982,11 +982,18 @@ CODE 
"\u{1f469 200d 2764 fe0f 200d 1f469}", ].each do |g| assert_equal [g], g.each_grapheme_cluster.to_a + assert_equal 1, g.each_grapheme_cluster.size + end + + [ + ["\u{a 308}", ["\u000A", "\u0308"]], + ["\u{d 308}", ["\u000D", "\u0308"]], + ["abc", ["a", "b", "c"]], + ].each do |str, grapheme_clusters| + assert_equal grapheme_clusters, str.each_grapheme_cluster.to_a + assert_equal grapheme_clusters.size, str.each_grapheme_cluster.size end - assert_equal ["\u000A", "\u0308"], "\u{a 308}".each_grapheme_cluster.to_a - assert_equal ["\u000D", "\u0308"], "\u{d 308}".each_grapheme_cluster.to_a - assert_equal ["a", "b", "c"], "abc".b.each_grapheme_cluster.to_a s = ("x"+"\u{10ABCD}"*250000) assert_empty(s.each_grapheme_cluster {s.clear}) end |