summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2018-11-26 09:03:11 +0000
committerduerst <duerst@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2018-11-26 09:03:11 +0000
commit0409290ec077770f48f2274324b9986fa49d5da7 (patch)
tree1937f998201cb7fac53af291fbf930d12af3d46a
parentebff9dc10e6e72239c23e50acc7d3cbfdc659e7a (diff)
add tests for grapheme clusters using Unicode Emoji test data
Add file test/ruby/enc/test_emoji_breaks.rb to test String#each_grapheme_cluster against the emoji test data provided by Unicode (at https://www.unicode.org/Public/emoji/#{EMOJI_VERSION}/). Lines containing emoji for genies, zombies, and wrestling are ignored because there seems to be a bug (#15343) in the implementation. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@65990 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r--test/ruby/enc/test_emoji_breaks.rb117
1 files changed, 117 insertions, 0 deletions
diff --git a/test/ruby/enc/test_emoji_breaks.rb b/test/ruby/enc/test_emoji_breaks.rb
new file mode 100644
index 0000000000..03c608859d
--- /dev/null
+++ b/test/ruby/enc/test_emoji_breaks.rb
@@ -0,0 +1,117 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class BreakTest
+ attr_reader :string, :comment, :filename, :line_number, :type, :shortname
+
+ def initialize (filename, line_number, data, comment='')
+ @filename = filename
+ @line_number = line_number
+ @comment = comment
+ if filename=='emoji-test'
+ codes, @type = data.split(/\s*;\s*/)
+ @shortname = ''
+ else
+ codes, @type, @shortname = data.split(/\s*;\s*/)
+ end
+ @string = codes.split(/\s+/)
+ .map do |ch|
+ c = ch.to_i(16)
+ # eliminate cases with surrogates
+ # raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+ c.chr('UTF-8')
+ end.join
+ raise ArgumentError if data.match? /genie/ or comment.match? /genie/
+ raise ArgumentError if data.match? /zombie/ or comment.match? /zombie/
+ raise ArgumentError if data.match? /wrestling/ or comment.match? /wrestling/
+ end
+end
+
+class TestEmojiBreaks < Test::Unit::TestCase
+ EMOJI_DATA_FILES = %w[emoji-sequences emoji-test emoji-variation-sequences emoji-zwj-sequences]
+ EMOJI_VERSION = '5.0' # hard-coded, should be replaced by
+ # RbConfig::CONFIG['UNICODE_EMOJI_VERSION'] or so, see feature #15341
+ EMOJI_DATA_PATH = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__)
+
+ def self.expand_filename(basename)
+ File.expand_path("#{EMOJI_DATA_PATH}/#{basename}.txt", __dir__)
+ end
+
+ def self.data_files_available?
+ EMOJI_DATA_FILES.all? do |f|
+ File.exist?(expand_filename(f))
+ end
+ end
+
+ def test_data_files_available
+ unless TestEmojiBreaks.data_files_available?
+ skip "Emoji data files not available in #{EMOJI_DATA_PATH}."
+ end
+ end
+end
+
+TestEmojiBreaks.data_files_available? and class TestEmojiBreaks
+ def read_data
+ tests = []
+ EMOJI_DATA_FILES.each do |filename|
+ version_mismatch = true
+ file_tests = []
+ IO.foreach(TestEmojiBreaks.expand_filename(filename), encoding: Encoding::UTF_8) do |line|
+ line.chomp!
+ raise "File Name Mismatch" if $.==1 and not line=="# #{filename}.txt"
+ version_mismatch = false if line=="# Version: #{EMOJI_VERSION}"
+ next if /\A(#|\z)/.match? line
+ file_tests << BreakTest.new(filename, $., *line.split('#')) rescue 'whatever'
+ end
+ raise "File Version Mismatch" if version_mismatch
+ tests += file_tests
+ end
+ tests
+ end
+
+ def all_tests
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ def test_single_emoji
+ all_tests.each do |test|
+ expected = [test.string]
+ actual = test.string.each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file: #{test.filename}, line #{test.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
+ end
+ end
+
+ def test_embedded_emoji
+ all_tests.each do |test|
+ expected = ["A", test.string, "Z"]
+ actual = "A#{test.string}Z".each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file: #{test.filename}, line #{test.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
+ end
+ end
+
+ # test some pseodorandom combinations of emoji
+ def test_mixed_emoji
+ srand 0
+ length = all_tests.length
+ step = 503 # use a prime number
+ all_tests.each do |test1|
+ start = rand step
+ start.step(by: step, to: length-1) do |t2|
+ test2 = all_tests[t2]
+ expected = [test1.string, test2.string]
+ actual = (test1.string+test2.string).each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file: #{test1.filename}, line #{test1.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', type: #{test1.type}, shortname: #{test1.shortname}, comment: #{test1.comment}"
+ end
+ end
+ end
+end