summaryrefslogtreecommitdiff
path: root/test/ruby/enc/test_emoji_breaks.rb
blob: c96d6088f50f3f48db98c9ccf5f6f0eab13428f0 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# frozen_string_literal: true
# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)

require "test/unit"

class TestEmojiBreaks < Test::Unit::TestCase
  # One test case parsed from a single line of an emoji data file.
  #
  # +string+ is the emoji sequence built from the hexadecimal code points in
  # the first ';'-separated field; +type+ and +shortname+ are the second and
  # third fields; +comment+ is the text that followed '#' on the line. All
  # textual fields have runs of whitespace collapsed to one space and are
  # stripped.
  class BreakTest
    attr_reader :string, :comment, :filename, :line_number, :type, :shortname

    # Files whose data lines are treated as having no shortname field.
    FILES_WITHOUT_SHORTNAME = %w[emoji-test emoji-variation-sequences].freeze

    # filename::    basename of the data file the line came from
    # line_number:: line number inside that file
    # data::        the part of the line before '#' ("codes ; type [; shortname]")
    # comment::     the part of the line after '#', if any
    #
    # A missing type or shortname field becomes an empty string instead of
    # raising NoMethodError on nil.
    def initialize(filename, line_number, data, comment='')
      @filename = filename
      @line_number = line_number
      @comment = squish(comment)
      codes, type, shortname = data.split(/\s*;\s*/)
      shortname = nil  if FILES_WITHOUT_SHORTNAME.include?(filename)
      @type = squish(type)
      @shortname = squish(shortname)
      # Strip first: leading whitespace would otherwise produce an empty token,
      # which to_i(16) silently turns into a NUL character.
      # Surrogate code points are not filtered out here.
      @string = codes.to_s.strip.split(/\s+/)
                     .map { |hex| hex.to_i(16).chr('UTF-8') }
                     .join
    end

    private

    # Collapses whitespace runs to a single space and trims; nil becomes ''.
    def squish(text)
      text.to_s.gsub(/\s+/, ' ').strip
    end
  end

  # Describes one emoji data file (base name, full path, expected version).
  # Every instance created is also appended to the shared FILES list.
  class BreakFile
    # Registry of all BreakFile instances, in creation order.
    FILES = []

    attr_reader :basename, :fullname, :version

    class << self
      # Returns the registry of all instances created so far.
      def files
        FILES
      end
    end

    # basename:: file name without the ".txt" extension
    # path::     directory containing the file
    # version::  version the file is expected to have
    def initialize(basename, path, version)
      @basename = basename
      @version  = version
      @fullname = "#{path}/#{basename}.txt"
      FILES.push(self)
    end
  end

  # Versions come from the build configuration of the running Ruby.
  UNICODE_VERSION   = RbConfig::CONFIG['UNICODE_VERSION']
  EMOJI_VERSION     = RbConfig::CONFIG['UNICODE_EMOJI_VERSION']

  # Data directories, resolved relative to this file's directory.
  UNICODE_DATA_PATH = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/ucd/emoji", __dir__)
  EMOJI_DATA_PATH   = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__)

  # The three emoji-versioned files first, then the Unicode-versioned
  # variation sequences file: four entries in total.
  EMOJI_DATA_FILES  = %w[emoji-sequences emoji-test emoji-zwj-sequences].map { |basename|
    BreakFile.new(basename, EMOJI_DATA_PATH, EMOJI_VERSION)
  }
  UNICODE_DATA_FILE = BreakFile.new('emoji-variation-sequences', UNICODE_DATA_PATH, UNICODE_VERSION)
  EMOJI_DATA_FILES.push(UNICODE_DATA_FILE)

  # True when every file listed in EMOJI_DATA_FILES exists on disk.
  def self.data_files_available?
    EMOJI_DATA_FILES.all? { |data_file| File.exist?(data_file.fullname) }
  end

  # Checks that four data files are configured, then omits (rather than
  # fails) when they are not all present on disk.
  def test_data_files_available
    assert_equal 4, EMOJI_DATA_FILES.size # debugging test
    return  if TestEmojiBreaks.data_files_available?

    omit "Emoji data files not available in #{EMOJI_DATA_PATH}."
  end

  if data_files_available?
    # Parses every file in EMOJI_DATA_FILES and returns one flat array of
    # BreakTest objects, in file order.
    #
    # Per file:
    # * line 1 must be "# <basename>-<version>.txt" (which also confirms the
    #   version) or "# <basename>.txt"; anything else raises
    #   "File Name Mismatch";
    # * any line starting with "# Version: <version>" confirms the version;
    # * comment lines and empty lines are skipped;
    # * a line starting with a code point range ("XXXX..YYYY ; ...") yields
    #   one BreakTest per code point in the range, all other lines yield one;
    # * if the version was never confirmed, raises "File Version Mismatch".
    def read_data
      EMOJI_DATA_FILES.flat_map do |file|
        # Escape the version so that its '.' only matches a literal dot.
        version_line = /\A# Version: #{Regexp.escape(file.version.to_s)}/
        version_mismatch = true
        file_tests = []
        # File.foreach never interprets the path as a "|command" pipe.
        File.foreach(file.fullname, encoding: Encoding::UTF_8).with_index(1) do |raw_line, line_number|
          line = raw_line.chomp
          if line_number == 1
            if line == "# #{file.basename}-#{file.version}.txt"
              version_mismatch = false
            elsif line != "# #{file.basename}.txt"
              raise "File Name Mismatch: line: #{line}, expected filename: #{file.basename}.txt"
            end
          end
          version_mismatch = false  if line.match?(version_line)
          next  if line.empty? || line.start_with?('#')

          # deal with Unicode ranges in emoji-sequences.txt (Bug #18028)
          range = line.match(/\A(\h{4,6})\.\.(\h{4,6}) *(;.+)/)
          if range
            rest = range[3]
            (range[1].to_i(16)..range[2].to_i(16)).each do |code_point|
              file_tests << BreakTest.new(file.basename, line_number,
                                          *"#{code_point.to_s(16)}#{rest}".split('#', 2))
            end
          else
            file_tests << BreakTest.new(file.basename, line_number, *line.split('#', 2))
          end
        end
        raise "File Version Mismatch: file: #{file.fullname}, version: #{file.version}"  if version_mismatch
        file_tests
      end
    end

    # Returns the parsed test cases, reading the data files only on the first
    # call; a missing file (Errno::ENOENT) results in an empty list.
    def all_tests
      @@tests ||= begin
        read_data
      rescue Errno::ENOENT
        []
      end
    end

    # Each emoji sequence on its own must form exactly one grapheme cluster.
    def test_single_emoji
      all_tests.each do |emoji|
        assert_equal [emoji.string], emoji.string.grapheme_clusters,
          "file: #{emoji.filename}, line #{emoji.line_number}, " \
          "type: #{emoji.type}, shortname: #{emoji.shortname}, comment: #{emoji.comment}"
      end
    end

    # Each emoji sequence placed between two tabs must split into exactly
    # three grapheme clusters: tab, the emoji, tab.
    def test_embedded_emoji
      all_tests.each do |emoji|
        wrapped = "\t#{emoji.string}\t"
        assert_equal ["\t", emoji.string, "\t"], wrapped.grapheme_clusters,
          "file: #{emoji.filename}, line #{emoji.line_number}, " \
          "type: #{emoji.type}, shortname: #{emoji.shortname}, comment: #{emoji.comment}"
      end
    end

    # Tests some pseudorandom combinations of emoji: two emoji sequences
    # concatenated must split into exactly those two grapheme clusters.
    #
    # For every test case, partners are picked at a random start index below
    # +step+ and then at every +step+-th index after it. A private generator
    # with a fixed seed keeps the selection reproducible without reseeding the
    # process-wide generator (as srand would).
    def test_mixed_emoji
      rng = Random.new(0)
      tests = all_tests
      last_index = tests.length - 1
      step = 503 # use a prime number
      tests.each do |test1|
        start = rng.rand(step)
        start.step(by: step, to: last_index) do |t2|
          test2 = tests[t2]
          # exclude skin tones, because they glue to previous grapheme clusters
          next  if (0x1F3FB..0x1F3FF).include? test2.string.ord
          expected = [test1.string, test2.string]
          actual = (test1.string + test2.string).each_grapheme_cluster.to_a
          assert_equal expected, actual,
            "file1: #{test1.filename}, line1 #{test1.line_number}, " \
            "file2: #{test2.filename}, line2 #{test2.line_number},\n" \
            "type1: #{test1.type}, shortname1: #{test1.shortname}, comment1: #{test1.comment},\n" \
            "type2: #{test2.type}, shortname2: #{test2.shortname}, comment2: #{test2.comment}"
        end
      end
    end
  end
end