summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author nagachika <nagachika@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-11-28 13:36:08 +0000
committer nagachika <nagachika@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2018-11-28 13:36:08 +0000
commit 29eae8b1e9a5142b30250d0a9cc738b4ce94eadc (patch)
tree a4957d36f174339bcc73d2bf573bda40af4213ca
parent b1944e41f5f5711e3b79bf08f2b54da1d7d6890b (diff)
merge revision(s) 65954,65955,65958: [Backport #15337]

Don't use single byte optimization on grapheme clusters

Unicode Text Segmentation considers CRLF as a character.
[Bug #15337]

add tests using Unicode test data for grapheme clusters

Add file test/ruby/enc/test_grapheme_breaks.rb to test
String#each_grapheme_cluster and \X extended grapheme cluster matcher
in regular expressions against test data provided by Unicode
(ucd/auxiliary/GraphemeBreakTest.txt).

Some lines in the data file are ignored, as follows:
- Lines with a surrogate, because Ruby doesn't handle these
- The case of "\r\n", because there is a bug (#15337) in the implementation

remove guard against bug #15337, because it is fixed

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@66073 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- string.c 4
-rw-r--r-- test/ruby/enc/test_grapheme_breaks.rb 92
-rw-r--r-- test/ruby/test_string.rb 1
-rw-r--r-- version.h 6
4 files changed, 98 insertions, 5 deletions
diff --git a/string.c b/string.c
index 80749c22a0..56db697e3c 100644
--- a/string.c
+++ b/string.c
@@ -8342,7 +8342,7 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
const char *ptr, *end;
- if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+ if (!rb_enc_unicode_p(enc)) {
return rb_str_length(str);
}
@@ -8370,7 +8370,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str));
const char *ptr, *end;
- if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) {
+ if (!rb_enc_unicode_p(enc)) {
return rb_str_enumerate_chars(str, ary);
}
diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb
new file mode 100644
index 0000000000..7f6c776113
--- /dev/null
+++ b/test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class BreakTest
+ attr_reader :clusters, :string, :comment, :line_number
+
+ def initialize (line_number, data, comment)
+ @line_number = line_number
+ @comment = comment
+ @clusters = data.sub(/\A\s*÷\s*/, '')
+ .sub(/\s*÷\s*\z/, '')
+ .split(/\s*÷\s*/)
+ .map do |cl|
+ cl.split(/\s*×\s*/)
+ .map do |ch|
+ c = ch.to_i(16)
+ # eliminate cases with surrogates
+ raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+ c.chr('UTF-8')
+ end.join
+ end
+ @string = @clusters.join
+ end
+end
+
+class TestGraphemeBreaksFromFile < Test::Unit::TestCase
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+ path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+ UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path
+ GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)
+
+ def self.file_available?
+ File.exist? GRAPHEME_BREAK_TEST_FILE
+ end
+
+ def test_data_files_available
+ unless TestGraphemeBreaksFromFile.file_available?
+ skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
+ end
+ end
+end
+
+TestGraphemeBreaksFromFile.file_available? and class TestGraphemeBreaksFromFile
+ def read_data
+ tests = []
+ IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line|
+ if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
+ raise "File Version Mismatch"
+ end
+ next if /\A#/.match? line
+ tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever'
+ end
+ tests
+ end
+
+ def all_tests
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ def test_each_grapheme_cluster
+ all_tests.each do |test|
+ expected = test.clusters
+ actual = test.string.each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "line #{test.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', comment: #{test.comment}"
+ end
+ end
+
+ def test_backslash_X
+ all_tests.each do |test|
+ clusters = test.clusters.dup
+ string = test.string.dup
+ removals = 0
+ while string.sub!(/\A\X/, '')
+ removals += 1
+ clusters.shift
+ expected = clusters.join
+ assert_equal expected, string,
+ "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ assert_equal expected, string,
+ "line #{test.line_number}, after last removal, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ end
+end
diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb
index c37191bda9..c0c4fe451a 100644
--- a/test/ruby/test_string.rb
+++ b/test/ruby/test_string.rb
@@ -968,6 +968,7 @@ CODE
def test_each_grapheme_cluster
[
+ "\u{0D 0A}",
"\u{20 200d}",
"\u{600 600}",
"\u{600 20}",
diff --git a/version.h b/version.h
index 639c20407d..e88b6ab32e 100644
--- a/version.h
+++ b/version.h
@@ -1,10 +1,10 @@
#define RUBY_VERSION "2.5.4"
-#define RUBY_RELEASE_DATE "2018-11-15"
-#define RUBY_PATCHLEVEL 112
+#define RUBY_RELEASE_DATE "2018-11-28"
+#define RUBY_PATCHLEVEL 113
#define RUBY_RELEASE_YEAR 2018
#define RUBY_RELEASE_MONTH 11
-#define RUBY_RELEASE_DAY 15
+#define RUBY_RELEASE_DAY 28
#include "ruby/version.h"