summaryrefslogtreecommitdiff
path: root/test/ruby/enc/test_regex_casefold.rb
blob: 808ea14c90ff8361ee7c76a0c73624a003375065 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Copyright Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

require "test/unit"
require 'unicode_normalize/normalize'  # only for UNICODE_VERSION

class TestCaseFold < Test::Unit::TestCase

  UNICODE_VERSION = UnicodeNormalize::UNICODE_VERSION
  CaseTest = Struct.new :source, :target, :kind, :line

  def check_downcase_properties(expected, start, *flags)
    assert_equal expected, start.downcase(*flags)
    temp = start
    assert_equal expected, temp.downcase!(*flags)
    assert_equal expected, expected.downcase(*flags)
    temp = expected
    assert_nil   temp.downcase!(*flags)
  end

  def read_tests
    IO.readlines(File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/CaseFolding.txt", __dir__))
    .collect.with_index { |linedata, linenumber| [linenumber.to_i+1, linedata.chomp] }
    .reject { |number, data| data =~ /^(#|$)/ }
    .collect do |linenumber, linedata|
      data, name = linedata.split /#\s*/
      code, kind, result, _ = data.split /;\s*/
      CaseTest.new code.to_i(16).chr('UTF-8'),
                   result.split(/ /).collect { |hex| hex.to_i(16) }.pack('U*'),
                   kind, linenumber
    end.select { |test| test.kind=='C' }
  end

  def to_codepoints(string)
    string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') }
  end

  def setup
    @@tests ||= read_tests
  rescue Errno::ENOENT => e
    @@tests ||= []
    puts e.message
  end

  def self.generate_test_casefold(encoding)
    define_method "test_mbc_case_fold_#{encoding}" do
      @@tests.each do |test|
        begin
          source = test.source.encode encoding
          target = test.target.encode encoding
          assert_equal 5, "12345#{target}67890" =~ /#{source}/i,
              "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " +
              "(CaseFolding.txt line #{test[:line]})"
        rescue Encoding::UndefinedConversionError
        end
      end
    end

    define_method "test_get_case_fold_codes_by_str_#{encoding}" do
      @@tests.each do |test|
        begin
          source = test.source.encode encoding
          target = test.target.encode encoding
          assert_equal 5, "12345#{source}67890" =~ /#{target}/i,
              "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " +
              "(CaseFolding.txt line #{test[:line]}), " +
              "error may also be triggered by mbc_case_fold"
        rescue Encoding::UndefinedConversionError
        end
      end
    end

    define_method "test_apply_all_case_fold_#{encoding}" do
      @@tests.each do |test|
        begin
          source = test.source.encode encoding
          target = test.target.encode encoding
          reg = '\p{Upper}'
          regexp = Regexp.compile reg.encode(encoding)
          regexpi = Regexp.compile reg.encode(encoding), Regexp::IGNORECASE
            assert_equal 5, "12345#{target}67890" =~ regexpi,
                "12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " +
                "(CaseFolding.txt line #{test[:line]})"
        rescue Encoding::UndefinedConversionError
        end
      end
    end
  end

  def test_downcase_fold
    @@tests.each do |test|
      check_downcase_properties test.target, test.source, :fold
    end
  end

  # start with good encodings only
  generate_test_casefold 'US-ASCII'
  generate_test_casefold 'ISO-8859-1'
  generate_test_casefold 'ISO-8859-2'
  generate_test_casefold 'ISO-8859-3'
  generate_test_casefold 'ISO-8859-4'
  generate_test_casefold 'ISO-8859-5'
  generate_test_casefold 'ISO-8859-6'
  # generate_test_casefold 'ISO-8859-7'
  generate_test_casefold 'ISO-8859-8'
  generate_test_casefold 'ISO-8859-9'
  generate_test_casefold 'ISO-8859-10'
  generate_test_casefold 'ISO-8859-11'
  generate_test_casefold 'ISO-8859-13'
  generate_test_casefold 'ISO-8859-14'
  generate_test_casefold 'ISO-8859-15'
  generate_test_casefold 'ISO-8859-16'
  generate_test_casefold 'Windows-1250'
  generate_test_casefold 'Windows-1252'
  #generate_test_casefold 'EUC-JP'
end