diff options
author | nagachika <nagachika@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-11-28 13:36:08 +0000 |
---|---|---|
committer | nagachika <nagachika@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2018-11-28 13:36:08 +0000 |
commit | 29eae8b1e9a5142b30250d0a9cc738b4ce94eadc (patch) | |
tree | a4957d36f174339bcc73d2bf573bda40af4213ca | |
parent | b1944e41f5f5711e3b79bf08f2b54da1d7d6890b (diff) |
merge revision(s) 65954,65955,65958: [Backport #15337]
Don't use single byte optimization on grapheme clusters
Unicode Text Segmentation considers CRLF as a character. [Bug #15337]
add tests using Unicode test data for grapheme clusters
Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster
and \X extended grapheme cluster matcher in regular expressions against test data
provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt).
Some lines in the data file are ignored, as follows:
- Lines with a surrogate, because Ruby doesn't handle these
- The case of "\r\n", because there is a bug (#15337) in the implementation
remove guard against bug #15337, because it is fixed
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_2_5@66073 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | string.c | 4 | ||||
-rw-r--r-- | test/ruby/enc/test_grapheme_breaks.rb | 92 | ||||
-rw-r--r-- | test/ruby/test_string.rb | 1 | ||||
-rw-r--r-- | version.h | 6 |
4 files changed, 98 insertions, 5 deletions
@@ -8342,7 +8342,7 @@ rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj) rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); const char *ptr, *end; - if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) { + if (!rb_enc_unicode_p(enc)) { return rb_str_length(str); } @@ -8370,7 +8370,7 @@ rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary) rb_encoding *enc = rb_enc_from_index(ENCODING_GET(str)); const char *ptr, *end; - if (!rb_enc_unicode_p(enc) || single_byte_optimizable(str)) { + if (!rb_enc_unicode_p(enc)) { return rb_str_enumerate_chars(str, ary); } diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb new file mode 100644 index 0000000000..7f6c776113 --- /dev/null +++ b/test/ruby/enc/test_grapheme_breaks.rb @@ -0,0 +1,92 @@ +# frozen_string_literal: true +# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class BreakTest + attr_reader :clusters, :string, :comment, :line_number + + def initialize (line_number, data, comment) + @line_number = line_number + @comment = comment + @clusters = data.sub(/\A\s*÷\s*/, '') + .sub(/\s*÷\s*\z/, '') + .split(/\s*÷\s*/) + .map do |cl| + cl.split(/\s*×\s*/) + .map do |ch| + c = ch.to_i(16) + # eliminate cases with surrogates + raise ArgumentError if 0xD800 <= c and c <= 0xDFFF + c.chr('UTF-8') + end.join + end + @string = @clusters.join + end +end + +class TestGraphemeBreaksFromFile < Test::Unit::TestCase + UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] + path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__) + UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path + GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__) + + def self.file_available? + File.exist? GRAPHEME_BREAK_TEST_FILE + end + + def test_data_files_available + unless TestGraphemeBreaksFromFile.file_available? + skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}." + end + end +end + +TestGraphemeBreaksFromFile.file_available? and class TestGraphemeBreaksFromFile + def read_data + tests = [] + IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line| + if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt") + raise "File Version Mismatch" + end + next if /\A#/.match? line + tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever' + end + tests + end + + def all_tests + @@tests ||= read_data + rescue Errno::ENOENT + @@tests ||= [] + end + + def test_each_grapheme_cluster + all_tests.each do |test| + expected = test.clusters + actual = test.string.each_grapheme_cluster.to_a + assert_equal expected, actual, + "line #{test.line_number}, expected '#{expected}', " + + "but got '#{actual}', comment: #{test.comment}" + end + end + + def test_backslash_X + all_tests.each do |test| + clusters = test.clusters.dup + string = test.string.dup + removals = 0 + while string.sub!(/\A\X/, '') + removals += 1 + clusters.shift + expected = clusters.join + assert_equal expected, string, + "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + assert_equal expected, string, + "line #{test.line_number}, after last removal, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + end +end diff --git a/test/ruby/test_string.rb b/test/ruby/test_string.rb index c37191bda9..c0c4fe451a 100644 --- a/test/ruby/test_string.rb +++ b/test/ruby/test_string.rb @@ -968,6 +968,7 @@ CODE def test_each_grapheme_cluster [ + "\u{0D 0A}", "\u{20 200d}", "\u{600 600}", "\u{600 20}", @@ -1,10 +1,10 @@ #define RUBY_VERSION "2.5.4" -#define RUBY_RELEASE_DATE "2018-11-15" -#define RUBY_PATCHLEVEL 112 +#define RUBY_RELEASE_DATE "2018-11-28" +#define RUBY_PATCHLEVEL 113 #define RUBY_RELEASE_YEAR 2018 #define RUBY_RELEASE_MONTH 11 -#define RUBY_RELEASE_DAY 15 +#define RUBY_RELEASE_DAY 28 #include "ruby/version.h" |