summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-03-22 11:18:00 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-03-22 11:18:00 +0000
commit: c40df5a76941d3a8c2cff46432b23401b6ffffbc (patch)
tree: b726b9076de34830cfe728987b29a2a0bf13044a
parent: 06e42980992c3e231ba5a5c6cf9457980477d78b (diff)
merge revision(s) 62892,62893: [Backport #14363]
fix each_grapheme_cluster's size
[Bug #14363]
From: Hugo Peixoto <hugo.peixoto@gmail.com>
Factor out get_reg_grapheme_cluster
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@62896 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- string.c 64
-rw-r--r-- test/ruby/test_string.rb 13
-rw-r--r-- version.h 2
3 files changed, 59 insertions, 20 deletions
diff --git a/string.c b/string.c
index 14a6b8a9a9..15f8f718f5 100644
--- a/string.c
+++ b/string.c
@@ -8309,20 +8309,12 @@ rb_str_codepoints(VALUE str)
return rb_str_enumerate_codepoints(str, ary);
}
-static VALUE
-rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
+static regex_t *
+get_reg_grapheme_cluster(rb_encoding *enc)
{
- VALUE orig = str;
+ int encidx = rb_enc_to_index(enc);
regex_t *reg_grapheme_cluster = NULL;
static regex_t *reg_grapheme_cluster_utf8 = NULL;
- int encidx = ENCODING_GET(str);
- rb_encoding *enc = rb_enc_from_index(encidx);
- int unicode_p = rb_enc_unicode_p(enc);
- const char *ptr, *end;
-
- if (!unicode_p || single_byte_optimizable(str)) {
- return rb_str_enumerate_chars(str, ary);
- }
/* synchronize */
if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
@@ -8339,8 +8331,51 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
}
}
+ return reg_grapheme_cluster;
+}
+
+static VALUE
+rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
+{
+ size_t grapheme_cluster_count = 0;
+ regex_t *reg_grapheme_cluster = NULL;
+ rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
+ const char *ptr, *end;
+
+ if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+ return rb_str_length(str);
+ }
+
+ reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
+ ptr = RSTRING_PTR(str);
+ end = RSTRING_END(str);
+
+ while (ptr < end) {
+ OnigPosition len = onig_match(reg_grapheme_cluster,
+ (const OnigUChar *)ptr, (const OnigUChar *)end,
+ (const OnigUChar *)ptr, NULL, 0);
+ if (len <= 0) break;
+ grapheme_cluster_count++;
+ ptr += len;
+ }
+
+ return SIZET2NUM(grapheme_cluster_count);
+}
+
+static VALUE
+rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
+{
+ VALUE orig = str;
+ regex_t *reg_grapheme_cluster = NULL;
+ rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
+ const char *ptr, *end;
+
+ if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+ return rb_str_enumerate_chars(str, ary);
+ }
if (!ary) str = rb_str_new_frozen(str);
+ reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
ptr = RSTRING_PTR(str);
end = RSTRING_END(str);
@@ -8348,10 +8383,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
OnigPosition len = onig_match(reg_grapheme_cluster,
(const OnigUChar *)ptr, (const OnigUChar *)end,
(const OnigUChar *)ptr, NULL, 0);
- if (len == 0) break;
- if (len < 0) {
- break;
- }
+ if (len <= 0) break;
ENUM_ELEM(ary, rb_enc_str_new(ptr, len, enc));
ptr += len;
}
@@ -8380,7 +8412,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
static VALUE
rb_str_each_grapheme_cluster(VALUE str)
{
- RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
+ RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
return rb_str_enumerate_grapheme_clusters(str, 0);
}
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index dd3a0349b5..f91ec297e0 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -980,11 +980,18 @@ CODE
"\u{1f469 200d 2764 fe0f 200d 1f469}",
].each do |g|
assert_equal [g], g.each_grapheme_cluster.to_a
+ assert_equal 1, g.each_grapheme_cluster.size
+ end
+
+ [
+ ["\u{a 308}", ["\u000A", "\u0308"]],
+ ["\u{d 308}", ["\u000D", "\u0308"]],
+ ["abc", ["a", "b", "c"]],
+ ].each do |str, grapheme_clusters|
+ assert_equal grapheme_clusters, str.each_grapheme_cluster.to_a
+ assert_equal grapheme_clusters.size, str.each_grapheme_cluster.size
end
- assert_equal ["\u000A", "\u0308"], "\u{a 308}".each_grapheme_cluster.to_a
- assert_equal ["\u000D", "\u0308"], "\u{d 308}".each_grapheme_cluster.to_a
- assert_equal ["a", "b", "c"], "abc".b.each_grapheme_cluster.to_a
s = ("x"+"\u{10ABCD}"*250000)
assert_empty(s.each_grapheme_cluster {s.clear})
end
diff --git a/version.h b/version.h
index 20b18a8916..4e0f5fc28b 100644
--- a/version.h
+++ b/version.h
@@ -1,6 +1,6 @@
#define RUBY_VERSION "2.5.1"
#define RUBY_RELEASE_DATE "2018-03-22"
-#define RUBY_PATCHLEVEL 49
+#define RUBY_PATCHLEVEL 50
#define RUBY_RELEASE_YEAR 2018
#define RUBY_RELEASE_MONTH 3