summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2018-11-24 12:10:25 +0000
committerduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2018-11-24 12:10:25 +0000
commit7599b3f6c66036d235ff850d4e30a7ae10693fe7 (patch)
treece9a8738690ac8c8238637fcb54d37e1344c6356
parent7850586af435f44ff97c93decc97995bbdf6bad4 (diff)
add tests using Unicode test data for grapheme clusters
Add file test/ruby/enc/test_grapheme_breaks.rb to test String#each_grapheme_cluster and the \X extended grapheme cluster matcher in regular expressions against test data provided by Unicode (ucd/auxiliary/GraphemeBreakTest.txt).
Some lines in the data file are ignored, as follows:
- Lines with a surrogate, because Ruby doesn't handle these
- The case of "\r\n", because there is a bug (#15337) in the implementation
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@65955 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--test/ruby/enc/test_grapheme_breaks.rb94
1 files changed, 94 insertions, 0 deletions
diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb
new file mode 100644
index 0000000000..5a4a8326c9
--- /dev/null
+++ b/test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,94 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class BreakTest # one parsed data line: the expected clusters and the string obtained by joining them
+ attr_reader :clusters, :string, :comment, :line_number
+
+ def initialize (line_number, data, comment) # data: hex code points separated by ÷ and × markers; raises ArgumentError for lines to be ignored
+ @line_number = line_number
+ @comment = comment
+ @clusters = data.sub(/\A\s*÷\s*/, '') # drop the leading ÷ marker
+ .sub(/\s*÷\s*\z/, '') # drop the trailing ÷ marker
+ .split(/\s*÷\s*/) # each piece between ÷ markers becomes one expected cluster
+ .map do |cl|
+ cl.split(/\s*×\s*/) # × separates the code points that belong to the same cluster
+ .map do |ch|
+ c = ch.to_i(16) # code points are written in hexadecimal
+ # eliminate cases with surrogates
+ raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+ c.chr('UTF-8')
+ end.join
+ end
+ @string = @clusters.join # the string under test is the concatenation of all expected clusters
+ # remove the following line once we have fixed bug #15337
+ raise ArgumentError if @string == "\r\n"
+ end
+end
+
+class TestGraphemeBreaksFromFile < Test::Unit::TestCase # locates GraphemeBreakTest.txt and reports a skip when it is missing
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+ path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__) # version-specific data directory, resolved relative to this file
+ UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path # use the ucd/auxiliary subdirectory when it exists, otherwise the directory itself
+ GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)
+
+ def self.file_available? # whether the data file exists on disk
+ File.exist? GRAPHEME_BREAK_TEST_FILE
+ end
+
+ def test_data_files_available # marks the run as skipped (not failed) when the data file is absent
+ unless TestGraphemeBreaksFromFile.file_available?
+ skip "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
+ end
+ end
+end
+
+TestGraphemeBreaksFromFile.file_available? and class TestGraphemeBreaksFromFile # the data-driven tests below are only defined when the data file exists
+ def read_data # parses the data file into an array of BreakTest objects
+ tests = []
+ IO.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line|
+ if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt") # the first line must name the expected Unicode version
+ raise "File Version Mismatch"
+ end
+ next if /\A#/.match? line # skip lines that start with a comment marker
+ tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever' # NOTE(review): the rescue modifier discards any StandardError, so lines BreakTest rejects (and any line that does not split into exactly data + comment) are skipped silently
+ end
+ tests
+ end
+
+ def all_tests # caches the parsed data in a class variable; falls back to an empty list if the file cannot be opened
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ def test_each_grapheme_cluster # String#each_grapheme_cluster must yield exactly the expected clusters for every data line
+ all_tests.each do |test|
+ expected = test.clusters
+ actual = test.string.each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "line #{test.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', comment: #{test.comment}"
+ end
+ end
+
+ def test_backslash_X # strips one leading \X match at a time and compares the remainder with the remaining expected clusters
+ all_tests.each do |test|
+ clusters = test.clusters.dup # dup: the loop shifts elements off this array
+ string = test.string.dup # dup: sub! below mutates the string in place
+ removals = 0
+ while string.sub!(/\A\X/, '') # sub! returns nil once \X no longer matches at the start, which ends the loop
+ removals += 1
+ clusters.shift
+ expected = clusters.join
+ assert_equal expected, string,
+ "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ assert_equal expected, string, # NOTE(review): expected keeps its value from the last loop iteration, and is nil if the loop body never ran
+ "line #{test.line_number}, after last removal, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ end
+end