summaryrefslogtreecommitdiff
path: root/test/ruby/enc/test_emoji_breaks.rb
blob: cdde4da9bf2d2e295909d1c149b50066c45aa562 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# frozen_string_literal: true
# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)

require "test/unit"

# Forward declaration: creates the TestEmojiBreaks namespace so that the helper
# classes TestEmojiBreaks::BreakTest and TestEmojiBreaks::BreakFile can be
# defined before the test class body itself, which follows further down.
class TestEmojiBreaks < Test::Unit::TestCase
end

# One test case built from a single entry of an emoji data file.
#
# Readers:
#   string      - the UTF-8 string made from the entry's hexadecimal code points
#   type        - second semicolon-separated field, whitespace-normalized
#   shortname   - third field, whitespace-normalized ('' for two-field files)
#   comment     - text following the '#', whitespace-normalized
#   filename    - base name of the data file the entry came from
#   line_number - line of the entry in that file
class TestEmojiBreaks::BreakTest
  attr_reader :string, :comment, :filename, :line_number, :type, :shortname

  # data is the part of the line before the first '#', comment the part after
  # it (callers obtain both with line.split('#', 2)).
  def initialize(filename, line_number, data, comment='')
    @filename = filename
    @line_number = line_number
    @comment = comment.gsub(/\s+/, ' ').strip

    fields = data.split(/\s*;\s*/)
    codes = fields[0]
    @type = fields[1]
    # For these two files only the code points and the type are read;
    # the shortname is left empty.
    two_field_file = %w[emoji-test emoji-variation-sequences].include?(filename)
    @shortname = two_field_file ? '' : fields[2]

    @type = @type.gsub(/\s+/, ' ').strip
    @shortname = @shortname.gsub(/\s+/, ' ').strip

    # Each whitespace-separated token is a hexadecimal code point.
    # No explicit guard against surrogate code points is applied here.
    @string = codes.split(/\s+/).map { |hex| hex.to_i(16).chr('UTF-8') }.join
  end
end

# Describes one emoji data file: its base name, the full path of the
# corresponding ".txt" file, and the version expected in its header.
# Every instance registers itself in FILES when it is created.
class TestEmojiBreaks::BreakFile
  # Registry of all instances, in creation order (appended to by initialize).
  FILES = []

  attr_reader :basename, :fullname, :version

  class << self
    # Returns the registry of all BreakFile instances created so far.
    def files
      FILES
    end
  end

  # basename - file name without the ".txt" extension
  # path     - directory containing the file
  # version  - version string expected in the file's header
  def initialize(basename, path, version)
    @basename = basename
    @version  = version
    @fullname = "#{path}/#{basename}.txt"
    FILES.push(self)
  end
end

# Grapheme cluster tests driven by the Unicode emoji data files.
# This part of the class locates the data files and reports whether they are
# available.
class TestEmojiBreaks < Test::Unit::TestCase
  UNICODE_VERSION   = RbConfig::CONFIG['UNICODE_VERSION']
  UNICODE_DATA_PATH = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/ucd/emoji", __dir__)
  EMOJI_VERSION     = RbConfig::CONFIG['UNICODE_EMOJI_VERSION']
  EMOJI_DATA_PATH   = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__)

  # The three files that live under EMOJI_DATA_PATH ...
  EMOJI_DATA_FILES  = %w[emoji-sequences emoji-test emoji-zwj-sequences].map do |basename|
    BreakFile.new(basename, EMOJI_DATA_PATH, EMOJI_VERSION)
  end
  # ... plus one file that lives under UNICODE_DATA_PATH. The expected version
  # is UNICODE_VERSION without its last two characters ([0..-3]), which deals
  # with a versioning mismatch problem in Unicode.
  UNICODE_DATA_FILE = BreakFile.new('emoji-variation-sequences', UNICODE_DATA_PATH, UNICODE_VERSION[0..-3])
  EMOJI_DATA_FILES << UNICODE_DATA_FILE

  # Returns true only if every file in EMOJI_DATA_FILES exists on disk.
  def self.data_files_available?
    EMOJI_DATA_FILES.all? do |f|
      File.exist?(f.fullname)
    end
  end

  # Skips (rather than fails) when data files are missing, and names the files
  # that are actually missing: they are spread over two directories, so naming
  # EMOJI_DATA_PATH alone would be misleading when only the file under
  # UNICODE_DATA_PATH is absent.
  def test_data_files_available
    assert_equal 4, EMOJI_DATA_FILES.size # debugging test
    unless TestEmojiBreaks.data_files_available?
      missing = EMOJI_DATA_FILES.reject { |f| File.exist?(f.fullname) }
      skip "Emoji data files not available: #{missing.map(&:fullname).join(', ')}."
    end
  end
end

# The data-driven tests below are only defined when every data file exists;
# otherwise the class body to the right of `and` is never evaluated.
TestEmojiBreaks.data_files_available? and class TestEmojiBreaks
  # Parses every file in EMOJI_DATA_FILES and returns a flat array of
  # BreakTest objects, one per code point sequence (ranges are expanded).
  #
  # Raises RuntimeError if the first line of a file is not "# <basename>.txt"
  # or if no line starting with "# Version: <version>" is found in a file.
  def read_data
    tests = []
    EMOJI_DATA_FILES.each do |file|
      version_mismatch = true
      file_tests = []
      # Escape the version so that "." only matches a literal dot, and build
      # the pattern once per file instead of once per line.
      version_header = /^# Version: #{Regexp.escape(file.version.to_s)}/
      File.foreach(file.fullname, encoding: Encoding::UTF_8).with_index(1) do |line, line_number|
        line.chomp!
        if line_number == 1 && line != "# #{file.basename}.txt"
          raise "File Name Mismatch: line: #{line}, expected filename: #{file.basename}.txt"
        end
        version_mismatch = false  if line.match?(version_header)
        next  if line.match?(/\A(#|\z)/)  # skip comment lines and empty lines
        range = line.match(/^(\h{4,6})\.\.(\h{4,6}) *(;.+)/)
        if range  # deal with Unicode ranges in emoji-sequences.txt (Bug #18028)
          range_start = range[1].to_i(16)
          range_end   = range[2].to_i(16)
          rest        = range[3]
          # one test per code point in the range, sharing the remaining fields
          (range_start..range_end).each do |code_point|
            file_tests << BreakTest.new(file.basename, line_number, *(code_point.to_s(16)+rest).split('#', 2))
          end
        else
          file_tests << BreakTest.new(file.basename, line_number, *line.split('#', 2))
        end
      end
      raise "File Version Mismatch: file: #{file.fullname}, version: #{file.version}"  if version_mismatch
      tests.concat(file_tests)
    end
    tests
  end

  # Returns the parsed tests, reading the data files only once and caching
  # the result in the class variable @@tests. If a file cannot be opened
  # (Errno::ENOENT), falls back to an empty list, so the tests below then
  # iterate over nothing.
  def all_tests
    @@tests ||= read_data
  rescue Errno::ENOENT
    @@tests ||= []
  end

  # Each emoji (sequence) on its own must form exactly one grapheme cluster.
  def test_single_emoji
    all_tests.each do |test|
      expected = [test.string]
      actual = test.string.each_grapheme_cluster.to_a
      assert_equal expected, actual,
        "file: #{test.filename}, line #{test.line_number}, " +
        "type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
    end
  end

  # Each emoji (sequence) between two tabs must stay a single grapheme
  # cluster, separate from the surrounding tabs.
  def test_embedded_emoji
    all_tests.each do |test|
      expected = ["\t", test.string, "\t"]
      actual = "\t#{test.string}\t".each_grapheme_cluster.to_a
      assert_equal expected, actual,
        "file: #{test.filename}, line #{test.line_number}, " +
        "type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
    end
  end

  # test some pseudorandom combinations of emoji
  def test_mixed_emoji
    # A local generator with a fixed seed keeps the selection reproducible
    # without reseeding the process-wide random number generator.
    rng = Random.new(0)
    tests = all_tests
    length = tests.length
    step =  503 # use a prime number
    tests.each do |test1|
      start = rng.rand(step)
      # pair test1 with every step-th test, beginning at a pseudorandom offset
      start.step(by: step, to: length-1) do |t2|
        test2 = tests[t2]
        # exclude skin tones, because they glue to previous grapheme clusters
        next  if (0x1F3FB..0x1F3FF).include? test2.string.ord
        expected = [test1.string, test2.string]
        actual = (test1.string+test2.string).each_grapheme_cluster.to_a
        assert_equal expected, actual,
          "file1: #{test1.filename}, line1 #{test1.line_number}, " +
          "file2: #{test2.filename}, line2 #{test2.line_number},\n" +
          "type1: #{test1.type}, shortname1: #{test1.shortname}, comment1: #{test1.comment},\n" +
          "type2: #{test2.type}, shortname2: #{test2.shortname}, comment2: #{test2.comment}"
      end
    end
  end
end