diff options
Diffstat (limited to 'test/ruby/enc')
| -rw-r--r-- | test/ruby/enc/test_big5.rb | 29 | ||||
| -rw-r--r-- | test/ruby/enc/test_case_comprehensive.rb | 306 | ||||
| -rw-r--r-- | test/ruby/enc/test_case_mapping.rb | 231 | ||||
| -rw-r--r-- | test/ruby/enc/test_case_options.rb | 81 | ||||
| -rw-r--r-- | test/ruby/enc/test_cesu8.rb | 113 | ||||
| -rw-r--r-- | test/ruby/enc/test_cp949.rb | 29 | ||||
| -rw-r--r-- | test/ruby/enc/test_emoji.rb | 443 | ||||
| -rw-r--r-- | test/ruby/enc/test_emoji_breaks.rb | 155 | ||||
| -rw-r--r-- | test/ruby/enc/test_euc_jp.rb | 25 | ||||
| -rw-r--r-- | test/ruby/enc/test_euc_kr.rb | 37 | ||||
| -rw-r--r-- | test/ruby/enc/test_euc_tw.rb | 29 | ||||
| -rw-r--r-- | test/ruby/enc/test_gb18030.rb | 127 | ||||
| -rw-r--r-- | test/ruby/enc/test_gbk.rb | 29 | ||||
| -rw-r--r-- | test/ruby/enc/test_grapheme_breaks.rb | 92 | ||||
| -rw-r--r-- | test/ruby/enc/test_iso_8859.rb | 166 | ||||
| -rw-r--r-- | test/ruby/enc/test_koi8.rb | 23 | ||||
| -rw-r--r-- | test/ruby/enc/test_regex_casefold.rb | 120 | ||||
| -rw-r--r-- | test/ruby/enc/test_shift_jis.rb | 28 | ||||
| -rw-r--r-- | test/ruby/enc/test_utf16.rb | 397 | ||||
| -rw-r--r-- | test/ruby/enc/test_utf32.rb | 162 | ||||
| -rw-r--r-- | test/ruby/enc/test_windows_1251.rb | 17 | ||||
| -rw-r--r-- | test/ruby/enc/test_windows_1252.rb | 26 |
22 files changed, 2665 insertions, 0 deletions
diff --git a/test/ruby/enc/test_big5.rb b/test/ruby/enc/test_big5.rb new file mode 100644 index 0000000000..5dcf93e8e3 --- /dev/null +++ b/test/ruby/enc/test_big5.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: false +require "test/unit" + +class TestBig5 < Test::Unit::TestCase + def s(s) + s.force_encoding("big5") + end + + def test_mbc_enc_len + assert_equal(1, s("\xa1\xa1").size) + end + + def test_mbc_to_code + assert_equal(0xa1a1, s("\xa1\xa1").ord) + end + + def test_code_to_mbc + assert_equal(s("\xa1\xa1"), 0xa1a1.chr("big5")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\xa1\xa1)\\1"), "i") + assert_match(r, s("\xa1\xa1\xa1\xa1")) + end + + def test_left_adjust_char_head + assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop) + end +end diff --git a/test/ruby/enc/test_case_comprehensive.rb b/test/ruby/enc/test_case_comprehensive.rb new file mode 100644 index 0000000000..b812b88b83 --- /dev/null +++ b/test/ruby/enc/test_case_comprehensive.rb @@ -0,0 +1,306 @@ +# frozen_string_literal: true +# Copyright © 2016 Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class TestComprehensiveCaseMapping < Test::Unit::TestCase + UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] + path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__) + UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path + + def self.hex2utf8(s) + s.split(' ').map { |c| c.to_i(16) }.pack('U*') + end + + def self.expand_filename(basename) + File.expand_path("#{UNICODE_DATA_PATH}/#{basename}.txt", __dir__) + end + + def self.data_files_available? + %w[UnicodeData CaseFolding SpecialCasing].all? do |f| + File.exist?(expand_filename(f)) + end + end + + def test_data_files_available + unless TestComprehensiveCaseMapping.data_files_available? + omit "Unicode data files not available in #{UNICODE_DATA_PATH}." + end + end +end + +TestComprehensiveCaseMapping.data_files_available? 
and class TestComprehensiveCaseMapping + (CaseTest = Struct.new(:method_name, :attributes, :first_data, :follow_data)).class_eval do + def initialize(method_name, attributes, first_data, follow_data=first_data) + super + end + end + + def self.read_data_file(filename) + File.foreach(expand_filename(filename), encoding: Encoding::ASCII_8BIT) do |line| + if $. == 1 + if filename == 'UnicodeData' + elsif line.start_with?("# #{filename}-#{UNICODE_VERSION}.txt") + else + raise "File Version Mismatch" + end + end + next if /\A(?:[\#@]|\s*\z)|Surrogate/.match?(line) + data = line.chomp.split('#')[0].split(/;\s*/, 15) + code = data[0].to_i(16).chr(Encoding::UTF_8) + yield code, data + end + end + + def self.read_data + @@codepoints = [] + + downcase = Hash.new { |h, c| c } + upcase = Hash.new { |h, c| c } + titlecase = Hash.new { |h, c| c } + casefold = Hash.new { |h, c| c } + swapcase = Hash.new { |h, c| c } + turkic_upcase = Hash.new { |h, c| upcase[c] } + turkic_downcase = Hash.new { |h, c| downcase[c] } + turkic_titlecase = Hash.new { |h, c| titlecase[c] } + turkic_swapcase = Hash.new { |h, c| swapcase[c] } + ascii_upcase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? upcase[c] : c } + ascii_downcase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? downcase[c] : c } + ascii_titlecase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? titlecase[c] : c } + ascii_swapcase = Hash.new { |h, c| /\A[a-z]\z/.match?(c) ? upcase[c] : (/\A[A-Z]\z/.match?(c) ? downcase[c] : c) } + + read_data_file('UnicodeData') do |code, data| + @@codepoints << code + upcase[code] = hex2utf8 data[12] unless data[12].empty? + downcase[code] = hex2utf8 data[13] unless data[13].empty? + if code>="\u1C90" and code<="\u1CBF" # exception for Georgian: use lowercase for titlecase + titlecase[code] = hex2utf8(data[13]) unless data[13].empty? + else + titlecase[code] = hex2utf8 data[14] unless data[14].empty? 
+ end + end + read_data_file('CaseFolding') do |code, data| + casefold[code] = hex2utf8(data[2]) if data[1] =~ /^[CF]$/ + end + + read_data_file('SpecialCasing') do |code, data| + case data[4] + when '' + upcase[code] = hex2utf8 data[3] + downcase[code] = hex2utf8 data[1] + titlecase[code] = hex2utf8 data[2] + when /\Atr\s*/ + if data[4]!='tr After_I' + turkic_upcase[code] = hex2utf8 data[3] + turkic_downcase[code] = hex2utf8 data[1] + turkic_titlecase[code] = hex2utf8 data[2] + end + end + end + + @@codepoints.each do |c| + if upcase[c] != c + if downcase[c] != c + swapcase[c] = turkic_swapcase[c] = + case c + when "\u01C5" then "\u0064\u017D" + when "\u01C8" then "\u006C\u004A" + when "\u01CB" then "\u006E\u004A" + when "\u01F2" then "\u0064\u005A" + else # Greek + downcase[upcase[c][0]] + "\u0399" + end + else + swapcase[c] = upcase[c] + turkic_swapcase[c] = turkic_upcase[c] + end + else + if downcase[c] != c + swapcase[c] = downcase[c] + turkic_swapcase[c] = turkic_downcase[c] + end + end + end + + [ + CaseTest.new(:downcase, [], downcase), + CaseTest.new(:upcase, [], upcase), + CaseTest.new(:capitalize, [], titlecase, downcase), + CaseTest.new(:swapcase, [], swapcase), + CaseTest.new(:downcase, [:fold], casefold), + CaseTest.new(:upcase, [:turkic], turkic_upcase), + CaseTest.new(:downcase, [:turkic], turkic_downcase), + CaseTest.new(:capitalize, [:turkic], turkic_titlecase, turkic_downcase), + CaseTest.new(:swapcase, [:turkic], turkic_swapcase), + CaseTest.new(:upcase, [:ascii], ascii_upcase), + CaseTest.new(:downcase, [:ascii], ascii_downcase), + CaseTest.new(:capitalize, [:ascii], ascii_titlecase, ascii_downcase), + CaseTest.new(:swapcase, [:ascii], ascii_swapcase), + ] + end + + def self.all_tests + @@tests ||= read_data + rescue Errno::ENOENT + @@tests ||= [] + end + + def self.generate_unicode_case_mapping_tests(encoding) + all_tests.each do |test| + attributes = test.attributes.map(&:to_s).join '-' + attributes.prepend '_' unless attributes.empty? 
+ define_method "test_#{encoding}_#{test.method_name}#{attributes}" do + @@codepoints.each do |code| + source = code.encode(encoding) * 5 + target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding) + result = source.__send__(test.method_name, *test.attributes) + assert_equal target, result, + proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"} + end + end + end + end + + def self.generate_single_byte_case_mapping_tests(encoding) + all_tests + # precalculate codepoints to speed up testing for small encodings + codepoints = [] + (0..255).each do |cp| + begin + codepoints << cp.chr(encoding).encode('UTF-8') + rescue Encoding::UndefinedConversionError, RangeError + end + end + all_tests.each do |test| + attributes = test.attributes.map(&:to_s).join '-' + attributes.prepend '_' unless attributes.empty? + define_method "test_#{encoding}_#{test.method_name}#{attributes}" do + codepoints.each do |code| + begin + source = code.encode(encoding) * 5 + begin + target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding) + rescue Encoding::UndefinedConversionError + if test.first_data[code]=="i\u0307" or test.follow_data[code]=="i\u0307" # explicit dot above + first_data = test.first_data[code]=="i\u0307" ? 'i' : test.first_data[code] + follow_data = test.follow_data[code]=="i\u0307" ? 
'i' : test.follow_data[code] + target = "#{first_data}#{follow_data*4}".encode(encoding) + elsif code =~ /i|I/ # special case for Turkic + raise + else + target = source + end + end + result = source.send(test.method_name, *test.attributes) + assert_equal target, result, + proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"} + rescue Encoding::UndefinedConversionError + end + end + end + end + end + + # test for encodings that don't yet (or will never) deal with non-ASCII characters + def self.generate_ascii_only_case_mapping_tests(encoding) + all_tests + # preselect codepoints to speed up testing for small encodings + codepoints = @@codepoints.select do |code| + begin + code.encode(encoding) + true + rescue Encoding::UndefinedConversionError + false + end + end + define_method "test_#{encoding}_upcase" do + codepoints.each do |code| + begin + source = code.encode(encoding) * 5 + target = source.tr 'a-z', 'A-Z' + result = source.upcase + assert_equal target, result, + "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}" + rescue Encoding::UndefinedConversionError + end + end + end + define_method "test_#{encoding}_downcase" do + codepoints.each do |code| + begin + source = code.encode(encoding) * 5 + target = source.tr 'A-Z', 'a-z' + result = source.downcase + assert_equal target, result, + "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}" + rescue Encoding::UndefinedConversionError + end + end + end + define_method "test_#{encoding}_capitalize" do + codepoints.each do |code| + begin + source = code.encode(encoding) * 5 + target = source[0].tr('a-z', 'A-Z') + source[1..-1].tr('A-Z', 'a-z') + result = source.capitalize + assert_equal target, result, + "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}" + rescue Encoding::UndefinedConversionError + end + end + end + define_method "test_#{encoding}_swapcase" do + codepoints.each do |code| + begin + 
source = code.encode(encoding) * 5 + target = source.tr('a-zA-Z', 'A-Za-z') + result = source.swapcase + assert_equal target, result, + "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}" + rescue Encoding::UndefinedConversionError + end + end + end + end + + generate_single_byte_case_mapping_tests 'US-ASCII' + generate_single_byte_case_mapping_tests 'ASCII-8BIT' + generate_single_byte_case_mapping_tests 'ISO-8859-1' + generate_single_byte_case_mapping_tests 'ISO-8859-2' + generate_single_byte_case_mapping_tests 'ISO-8859-3' + generate_single_byte_case_mapping_tests 'ISO-8859-4' + generate_single_byte_case_mapping_tests 'ISO-8859-5' + generate_single_byte_case_mapping_tests 'ISO-8859-6' + generate_single_byte_case_mapping_tests 'ISO-8859-7' + generate_single_byte_case_mapping_tests 'ISO-8859-8' + generate_single_byte_case_mapping_tests 'ISO-8859-9' + generate_single_byte_case_mapping_tests 'ISO-8859-10' + generate_single_byte_case_mapping_tests 'ISO-8859-11' + generate_single_byte_case_mapping_tests 'ISO-8859-13' + generate_single_byte_case_mapping_tests 'ISO-8859-14' + generate_single_byte_case_mapping_tests 'ISO-8859-15' + generate_single_byte_case_mapping_tests 'ISO-8859-16' + generate_ascii_only_case_mapping_tests 'KOI8-R' + generate_ascii_only_case_mapping_tests 'KOI8-U' + generate_ascii_only_case_mapping_tests 'Big5' + generate_ascii_only_case_mapping_tests 'EUC-JP' + generate_ascii_only_case_mapping_tests 'EUC-KR' + generate_ascii_only_case_mapping_tests 'GB18030' + generate_ascii_only_case_mapping_tests 'GB2312' + generate_ascii_only_case_mapping_tests 'GBK' + generate_ascii_only_case_mapping_tests 'Shift_JIS' + generate_ascii_only_case_mapping_tests 'Windows-31J' + generate_single_byte_case_mapping_tests 'Windows-1250' + generate_single_byte_case_mapping_tests 'Windows-1251' + generate_single_byte_case_mapping_tests 'Windows-1252' + generate_single_byte_case_mapping_tests 'Windows-1253' + generate_single_byte_case_mapping_tests 
'Windows-1254' + generate_single_byte_case_mapping_tests 'Windows-1255' + generate_ascii_only_case_mapping_tests 'Windows-1256' + generate_single_byte_case_mapping_tests 'Windows-1257' + generate_unicode_case_mapping_tests 'UTF-8' + generate_unicode_case_mapping_tests 'UTF-16BE' + generate_unicode_case_mapping_tests 'UTF-16LE' + generate_unicode_case_mapping_tests 'UTF-32BE' + generate_unicode_case_mapping_tests 'UTF-32LE' +end diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb new file mode 100644 index 0000000000..a7d1ed0d16 --- /dev/null +++ b/test/ruby/enc/test_case_mapping.rb @@ -0,0 +1,231 @@ +# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +# preliminary tests, using as a guard +# to test new implementation strategy +class TestCaseMappingPreliminary < Test::Unit::TestCase + # checks, including idempotence and non-modification; not always guaranteed + def check_upcase_properties(expected, start, *flags) + assert_equal expected, start.upcase(*flags) + temp = start.dup + assert_equal expected, temp.upcase!(*flags) unless expected==temp + assert_equal nil, temp.upcase!(*flags) if expected==temp + assert_equal expected, expected.upcase(*flags) + temp = expected.dup + assert_nil temp.upcase!(*flags) + end + + def check_downcase_properties(expected, start, *flags) + assert_equal expected, start.downcase(*flags) + temp = start.dup + assert_equal expected, temp.downcase!(*flags) unless expected==temp + assert_equal nil, temp.downcase!(*flags) if expected==temp + assert_equal expected, expected.downcase(*flags) + temp = expected.dup + assert_nil temp.downcase!(*flags) + end + + def check_capitalize_properties(expected, start, *flags) + assert_equal expected, start.capitalize(*flags) + temp = start.dup + assert_equal expected, temp.capitalize!(*flags) unless expected==temp + assert_equal nil, temp.capitalize!(*flags) if expected==temp + assert_equal expected, 
expected.capitalize(*flags) + temp = expected.dup + assert_nil temp.capitalize!(*flags) + end + + def check_capitalize_suffixes(lower, upper) + while upper.length > 1 + lower = lower[1..-1] + check_capitalize_properties upper[0]+lower, upper + upper = upper[1..-1] + end + end + + # different properties; careful: roundtrip isn't always guaranteed + def check_swapcase_properties(expected, start, *flags) + assert_equal expected, start.swapcase(*flags) + temp = +start + assert_equal expected, temp.swapcase!(*flags) + assert_equal start, start.swapcase(*flags).swapcase(*flags) + assert_equal expected, expected.swapcase(*flags).swapcase(*flags) + end + + def test_ascii + check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)' + check_upcase_properties 'YUKIHIRO MATSUMOTO (MATZ)', 'yukihiro matsumoto (matz)' + check_capitalize_properties 'Yukihiro matsumoto (matz)', 'yukihiro MATSUMOTO (MATZ)' + check_swapcase_properties 'yUKIHIRO matsumoto (MAtz)', 'Yukihiro MATSUMOTO (maTZ)' + end + + def test_invalid + assert_raise(ArgumentError, "Should not be possible to upcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').upcase } + assert_raise(ArgumentError, "Should not be possible to downcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').downcase } + assert_raise(ArgumentError, "Should not be possible to capitalize invalid string.") { "\xEB".dup.force_encoding('UTF-8').capitalize } + assert_raise(ArgumentError, "Should not be possible to swapcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').swapcase } + end + + def test_general + check_downcase_properties 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ' + check_upcase_properties 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ', 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ' + check_capitalize_suffixes 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ' + check_swapcase_properties 'résumé DÜRST ĭñŧėřŊÃŢIJŇŐŃæłĩżàťïōņ', 'RÉSUMÉ dürst 
ĬÑŦĖŘŋãţijňőńÆŁĨŻÀŤÏŌŅ' + end + + def test_one_way_upcase + check_upcase_properties 'ΜΜΜΜΜ', 'µµµµµ' # MICRO SIGN -> Greek Mu + check_downcase_properties 'µµµµµ', 'µµµµµ' # MICRO SIGN -> Greek Mu + check_capitalize_properties 'Μµµµµ', 'µµµµµ' # MICRO SIGN -> Greek Mu + check_capitalize_properties 'Μµµµµ', 'µµµµµ', :turkic # MICRO SIGN -> Greek Mu + check_capitalize_properties 'H̱ẖẖẖẖ', 'ẖẖẖẖẖ' + check_capitalize_properties 'Βϐϐϐϐ', 'ϐϐϐϐϐ' + check_capitalize_properties 'Θϑϑϑϑ', 'ϑϑϑϑϑ' + check_capitalize_properties 'Φϕ', 'ϕϕ' + check_capitalize_properties 'Πϖ', 'ϖϖ' + check_capitalize_properties 'Κϰ', 'ϰϰ' + check_capitalize_properties 'Ρϱϱ', 'ϱϱϱ' + check_capitalize_properties 'Εϵ', 'ϵϵ' + check_capitalize_properties 'Ιͅͅͅͅ', 'ͅͅͅͅͅ' + check_capitalize_properties 'Sſſſſ', 'ſſſſſ' + end + + def test_various + check_upcase_properties 'Μ', 'µ' # MICRO SIGN -> Greek Mu + check_downcase_properties 'µµµµµ', 'µµµµµ' # MICRO SIGN + check_capitalize_properties 'Ss', 'ß' + check_upcase_properties 'SS', 'ß' + end + + def test_cherokee + check_downcase_properties "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ' + check_upcase_properties 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79" + check_capitalize_suffixes "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ' + assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ'.downcase(:fold) + assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79".downcase(:fold) + end + + def test_titlecase + check_downcase_properties 'dz dž lj nj', 'Dz Dž Lj Nj' + check_downcase_properties 'dz dž lj nj', 'DZ DŽ LJ NJ' + check_upcase_properties 'DZ DŽ LJ NJ', 'Dz Dž Lj Nj' + check_upcase_properties 'DZ DŽ LJ NJ', 'dz dž lj nj' + check_capitalize_properties 'Dz', 'DZ' + check_capitalize_properties 'Dž', 'DŽ' + check_capitalize_properties 'Lj', 'LJ' + check_capitalize_properties 'Nj', 'NJ' + check_capitalize_properties 'Dz', 'dz' + 
check_capitalize_properties 'Dž', 'dž' + check_capitalize_properties 'Lj', 'lj' + check_capitalize_properties 'Nj', 'nj' + end + + def test_swapcase + assert_equal 'dZ', 'Dz'.swapcase + assert_equal 'dŽ', 'Dž'.swapcase + assert_equal 'lJ', 'Lj'.swapcase + assert_equal 'nJ', 'Nj'.swapcase + assert_equal 'ἀΙ', 'ᾈ'.swapcase + assert_equal 'ἣΙ', 'ᾛ'.swapcase + assert_equal 'ὧΙ', 'ᾯ'.swapcase + assert_equal 'αΙ', 'ᾼ'.swapcase + assert_equal 'ηΙ', 'ῌ'.swapcase + assert_equal 'ωΙ', 'ῼ'.swapcase + end + + def test_ascii_option + check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :ascii + check_upcase_properties 'YUKIHIRO MATSUMOTO (MATZ)', 'yukihiro matsumoto (matz)', :ascii + check_capitalize_properties 'Yukihiro matsumoto (matz)', 'yukihiro MATSUMOTO (MATZ)', :ascii + check_swapcase_properties 'yUKIHIRO matsumoto (MAtz)', 'Yukihiro MATSUMOTO (maTZ)', :ascii + check_downcase_properties 'yukİhİro matsumoto (matz)', 'YUKİHİRO MATSUMOTO (MATZ)', :ascii + check_downcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii + check_swapcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii + end + + def test_fold_option + check_downcase_properties 'ss', 'ß', :fold + check_downcase_properties 'fifl', 'fifl', :fold + check_downcase_properties 'σ', 'ς', :fold + check_downcase_properties 'μ', 'µ', :fold # MICRO SIGN -> Greek mu + end + + def test_turcic + check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :turkic + check_upcase_properties 'YUKİHİRO MATSUMOTO (MATZ)', 'Yukihiro Matsumoto (matz)', :turkic + check_downcase_properties "yuki\u0307hi\u0307ro matsumoto (matz)", 'YUKİHİRO MATSUMOTO (MATZ)' + end + + def test_greek + check_downcase_properties 'αβγδεζηθικλμνξοπρστυφχψω', 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ' + check_upcase_properties 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ', 'αβγδεζηθικλμνξοπρστυφχψω' + end + + # This test checks against problems 
when changing the order of mapping results + # in some of the entries of the unfolding table (related to + # https://bugs.ruby-lang.org/issues/12990). + def test_reorder_unfold + # GREEK SMALL LETTER IOTA + assert_equal 0, "\u03B9" =~ /\u0345/i + assert_equal 0, "\u0345" =~ /\u03B9/i + assert_equal 0, "\u03B9" =~ /\u0399/i + assert_equal 0, "\u0399" =~ /\u03B9/i + assert_equal 0, "\u03B9" =~ /\u1fbe/i + assert_equal 0, "\u1fbe" =~ /\u03B9/i + + # GREEK SMALL LETTER MU + assert_equal 0, "\u03BC" =~ /\u00B5/i + assert_equal 0, "\u00B5" =~ /\u03BC/i + assert_equal 0, "\u03BC" =~ /\u039C/i + assert_equal 0, "\u039C" =~ /\u03BC/i + + # CYRILLIC SMALL LETTER MONOGRAPH UK + assert_equal 0, "\uA64B" =~ /\u1c88/i + assert_equal 0, "\u1c88" =~ /\uA64B/i + assert_equal 0, "\uA64B" =~ /\ua64A/i + assert_equal 0, "\ua64A" =~ /\uA64B/i + end + + def test_georgian_canary + message = "Reexamine implementation of Georgian in String#capitalize" + assert_equal false, "\u1CBB".match?(/\p{assigned}/), message + assert_equal false, "\u1CBC".match?(/\p{assigned}/), message + end + + def test_georgian_unassigned + message = "Unassigned codepoints should not be converted" + assert_equal "\u1CBB", "\u1CBB".capitalize, message + assert_equal "\u1CBC", "\u1CBC".capitalize, message + end + + def test_georgian_capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u1C92".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u10D2".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u1C92".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u10D2".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u1C92".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u10D2".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u1C92".capitalize + assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u10D2".capitalize + end + + def test_shift_jis_downcase_ascii + s = ("A".."Z").map {|c| "\x89#{c}"}.join("").force_encoding("Shift_JIS") + assert_equal 
s, s.downcase(:ascii) + end + + def test_shift_jis_upcase_ascii + s = ("a".."z").map {|c| "\x89#{c}"}.join("").force_encoding("Shift_JIS") + assert_equal s, s.upcase(:ascii) + end + + def no_longer_a_test_buffer_allocations + assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic) + assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic) + assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic) + assert_equal 'TURKISH*ı'*10_000, ('I'*10_000).downcase(:turkic) + assert_equal 'TURKISH*ı'*100_000, ('I'*100_000).downcase(:turkic) + assert_equal 'TURKISH*ı'*1_000_000, ('I'*1_000_000).downcase(:turkic) + end +end diff --git a/test/ruby/enc/test_case_options.rb b/test/ruby/enc/test_case_options.rb new file mode 100644 index 0000000000..e9c81d804e --- /dev/null +++ b/test/ruby/enc/test_case_options.rb @@ -0,0 +1,81 @@ +# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class TestCaseOptions < Test::Unit::TestCase + def assert_raise_functional_operations(arg, *options) + assert_raise(ArgumentError) { arg.upcase(*options) } + assert_raise(ArgumentError) { arg.downcase(*options) } + assert_raise(ArgumentError) { arg.capitalize(*options) } + assert_raise(ArgumentError) { arg.swapcase(*options) } + end + + def assert_raise_bang_operations(arg, *options) + assert_raise(ArgumentError) { arg.upcase!(*options) } + assert_raise(ArgumentError) { arg.downcase!(*options) } + assert_raise(ArgumentError) { arg.capitalize!(*options) } + assert_raise(ArgumentError) { arg.swapcase!(*options) } + end + + def assert_raise_both_types(*options) + assert_raise_functional_operations 'a', *options + assert_raise_bang_operations(+'a', *options) + assert_raise_functional_operations :a, *options + end + + def test_option_errors + assert_raise_both_types :invalid + assert_raise_both_types :lithuanian, :turkic, :fold + assert_raise_both_types :fold, :fold + assert_raise_both_types :ascii, :fold + assert_raise_both_types :fold, 
:ascii + assert_raise_both_types :ascii, :turkic + assert_raise_both_types :turkic, :ascii + assert_raise_both_types :ascii, :lithuanian + assert_raise_both_types :lithuanian, :ascii + end + + def assert_okay_functional_operations(arg, *options) + assert_nothing_raised { arg.upcase(*options) } + assert_nothing_raised { arg.downcase(*options) } + assert_nothing_raised { arg.capitalize(*options) } + assert_nothing_raised { arg.swapcase(*options) } + end + + def assert_okay_bang_operations(arg, *options) + assert_nothing_raised { arg.upcase!(*options) } + assert_nothing_raised { arg.downcase!(*options) } + assert_nothing_raised { arg.capitalize!(*options) } + assert_nothing_raised { arg.swapcase!(*options) } + end + + def assert_okay_both_types(*options) + assert_okay_functional_operations 'a', *options + assert_okay_bang_operations(+'a', *options) + assert_okay_functional_operations :a, *options + end + + def test_options_okay + assert_okay_both_types + assert_okay_both_types :ascii + assert_okay_both_types :turkic + assert_okay_both_types :lithuanian + assert_okay_both_types :turkic, :lithuanian + assert_okay_both_types :lithuanian, :turkic + end + + def test_operation_specific # :fold option only allowed on downcase + assert_nothing_raised { 'a'.downcase :fold } + assert_raise(ArgumentError) { 'a'.upcase :fold } + assert_raise(ArgumentError) { 'a'.capitalize :fold } + assert_raise(ArgumentError) { 'a'.swapcase :fold } + assert_nothing_raised { 'a'.dup.downcase! :fold } + assert_raise(ArgumentError) { 'a'.dup.upcase! :fold } + assert_raise(ArgumentError) { 'a'.dup.capitalize! :fold } + assert_raise(ArgumentError) { 'a'.dup.swapcase! 
:fold } + assert_nothing_raised { :a.downcase :fold } + assert_raise(ArgumentError) { :a.upcase :fold } + assert_raise(ArgumentError) { :a.capitalize :fold } + assert_raise(ArgumentError) { :a.swapcase :fold } + end +end diff --git a/test/ruby/enc/test_cesu8.rb b/test/ruby/enc/test_cesu8.rb new file mode 100644 index 0000000000..68a08389ea --- /dev/null +++ b/test/ruby/enc/test_cesu8.rb @@ -0,0 +1,113 @@ +# frozen_string_literal: false +require 'test/unit' + +class TestCESU8 < Test::Unit::TestCase + + def encdump(obj) + case obj + when String + obj.dump + when Regexp + "Regexp.new(#{encdump(obj.source)}, #{obj.options})" + else + raise Argument, "unexpected: #{obj.inspect}" + end + end + + def enccall(recv, meth, *args) + desc = '' + if String === recv + desc << encdump(recv) + else + desc << recv.inspect + end + desc << '.' << meth.to_s + if !args.empty? + desc << '(' + args.each_with_index {|a, i| + desc << ',' if 0 < i + if String === a + desc << encdump(a) + else + desc << a.inspect + end + } + desc << ')' + end + result = nil + assert_nothing_raised(desc) { + result = recv.send(meth, *args) + } + result + end + + def assert_str_equal(expected, actual, message=nil) + full_message = build_message(message, <<EOT) +#{encdump expected} expected but not equal to +#{encdump actual}. 
+EOT + assert_equal(expected, actual, full_message) + end + + # tests start + + def test_cesu8_valid_encoding + all_assertions do |a| + [ + "\x00", + "\x7f", + "\u0080", + "\u07ff", + "\u0800", + "\ud7ff", + "\xed\xa0\x80\xed\xb0\x80", + "\xed\xaf\xbf\xed\xbf\xbf", + "\ue000", + "\uffff", + ].each {|s| + s.force_encoding("cesu-8") + a.for(s) { + assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + [ + "\x80", + "\xc0\x80", + "\xc0", + "\xe0\x80\x80", + "\xed\xa0\x80", + "\xed\xb0\x80\xed\xb0\x80", + "\xe0", + "\xff", + ].each {|s| + s.force_encoding("cesu-8") + a.for(s) { + assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?") + } + } + end + end + + def test_cesu8_ord + [ + ["\x00", 0], + ["\x7f", 0x7f], + ["\u0080", 0x80], + ["\u07ff", 0x7ff], + ["\u0800", 0x800], + ["\ud7ff", 0xd7ff], + ["\xed\xa0\x80\xed\xb0\x80", 0x10000], + ["\xed\xaf\xbf\xed\xbf\xbf", 0x10ffff], + ["\xee\x80\x80", 0xe000], + ["\xef\xbf\xbf", 0xffff], + ].each do |chr, ord| + chr.force_encoding("cesu-8") + assert_equal ord, chr.ord + assert_equal chr, ord.chr("cesu-8") + end + end + + def test_cesu8_left_adjust_char_head + assert_equal("", "\u{10000}".encode("cesu-8").chop) + end +end diff --git a/test/ruby/enc/test_cp949.rb b/test/ruby/enc/test_cp949.rb new file mode 100644 index 0000000000..0684162d5b --- /dev/null +++ b/test/ruby/enc/test_cp949.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: false +require "test/unit" + +class TestCP949 < Test::Unit::TestCase + def s(s) + s.force_encoding("cp949") + end + + def test_mbc_enc_len + assert_equal(1, s("\xa1\xa1").size) + end + + def test_mbc_to_code + assert_equal(0xa1a1, s("\xa1\xa1").ord) + end + + def test_code_to_mbc + assert_equal(s("\xa1\xa1"), 0xa1a1.chr("cp949")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\xa1\xa1)\\1"), "i") + assert_match(r, s("\xa1\xa1\xa1\xa1")) + end + + def test_left_adjust_char_head + assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop) + end +end diff 
--git a/test/ruby/enc/test_emoji.rb b/test/ruby/enc/test_emoji.rb new file mode 100644 index 0000000000..330ff70cb9 --- /dev/null +++ b/test/ruby/enc/test_emoji.rb @@ -0,0 +1,443 @@ +# frozen_string_literal: false +require 'test/unit' + +module Emoji + + class TestRenameSJIS < Test::Unit::TestCase + def test_shift_jis + assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-DoCoMo") } + assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-KDDI") } + assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-SoftBank") } + end + end + + class TestUTF8_BLACK_SUN_WITH_RAYS < Test::Unit::TestCase + include Emoji + + def setup + @codes = { + "UTF8-DoCoMo" => utf8_docomo("\u{E63E}"), + "UTF8-KDDI" => utf8_kddi("\u{E488}"), + "UTF8-SoftBank" => utf8_softbank("\u{E04A}"), + "UTF-8" => "\u{2600}", + } + end + + def test_convert + @codes.each do |from_enc, from_str| + @codes.each do |to_enc, to_str| + next if from_enc == to_enc + assert_equal to_str, from_str.encode(to_enc), "convert from #{from_enc} to #{to_enc}" + end + end + end + end + + class TestDoCoMo < Test::Unit::TestCase + include Emoji + + def setup + setup_instance_variable(self) + end + + def test_encoding_name + %w(UTF8-DoCoMo + SJIS-DoCoMo).each do |n| + assert_include Encoding.name_list, n, "encoding not found: #{n}" + end + end + + def test_comparison + assert_not_equal Encoding::UTF_8, Encoding::UTF8_DoCoMo + assert_not_equal Encoding::Windows_31J, Encoding::SJIS_DoCoMo + end + + def test_from_utf8 + assert_nothing_raised { assert_equal utf8_docomo(@aiueo_utf8), to_utf8_docomo(@aiueo_utf8) } + assert_nothing_raised { assert_equal sjis_docomo(@aiueo_sjis), to_sjis_docomo(@aiueo_utf8) } + end + + def test_from_sjis + assert_nothing_raised { assert_equal utf8_docomo(@aiueo_utf8), to_utf8_docomo(@aiueo_sjis) } + assert_nothing_raised { assert_equal sjis_docomo(@aiueo_sjis), to_sjis_docomo(@aiueo_sjis) } + end + + def test_to_utf8 + assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_docomo) } + 
assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_docomo) } + end + + def test_to_sjis + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_docomo) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_docomo) } + end + + def test_to_eucjp + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_docomo) } + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_docomo) } + end + + def test_docomo + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_docomo) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_docomo) } + end + + def test_to_kddi + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@utf8_docomo) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_docomo) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_docomo) } + + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_docomo) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@sjis_docomo) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_docomo) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@utf8_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@utf8_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@utf8_docomo_only) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@sjis_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@sjis_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@sjis_docomo_only) } + end + + def test_to_softbank + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_docomo) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_docomo) } + + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_docomo) } + 
assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_docomo) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_softbank(@utf8_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_softbank(@utf8_docomo_only) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_softbank(@sjis_docomo_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_softbank(@sjis_docomo_only) } + end + end + + class TestKDDI < Test::Unit::TestCase + include Emoji + + def setup + setup_instance_variable(self) + end + + def test_encoding_name + %w(UTF8-KDDI + SJIS-KDDI + ISO-2022-JP-KDDI + stateless-ISO-2022-JP-KDDI).each do |n| + assert_include Encoding.name_list, n, "encoding not found: #{n}" + end + end + + def test_comparison + assert_not_equal Encoding::UTF_8, Encoding::UTF8_KDDI + assert_not_equal Encoding::Windows_31J, Encoding::SJIS_KDDI + assert_not_equal Encoding::ISO_2022_JP, Encoding::ISO_2022_JP_KDDI + assert_not_equal Encoding::Stateless_ISO_2022_JP, Encoding::Stateless_ISO_2022_JP_KDDI + end + + def test_from_utf8 + assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_utf8) } + assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_utf8) } + assert_nothing_raised { assert_equal iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_utf8) } + end + + def test_from_sjis + assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_sjis) } + assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_sjis) } + assert_nothing_raised { assert_equal iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_sjis) } + end + + def test_from_iso2022jp + assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_iso2022jp) } + assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_iso2022jp) } + assert_nothing_raised { assert_equal 
iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_iso2022jp) } + end + + def test_to_utf8 + assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_kddi) } + assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_undoc_kddi) } + assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_kddi) } + assert_nothing_raised { assert_equal @utf8, to_utf8(@iso2022jp_kddi) } + end + + def test_to_sjis + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_undoc_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@iso2022jp_kddi) } + end + + def test_to_eucjp + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_undoc_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_kddi) } + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@iso2022jp_kddi) } + end + + def test_kddi + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_kddi) } + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@iso2022jp_kddi) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@sjis_kddi) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_undoc_kddi) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@iso2022jp_kddi) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_kddi) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_undoc_kddi) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@iso2022jp_kddi) } + end + + def test_to_docomo + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_kddi) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_kddi) } + + assert_nothing_raised { 
assert_equal @utf8_docomo, to_utf8_docomo(@utf8_undoc_kddi) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_undoc_kddi) } + + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_kddi) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_kddi) } + + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@iso2022jp_kddi) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@iso2022jp_kddi) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_undoc_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_undoc_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@iso2022jp_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@iso2022jp_kddi_only) } + end + + def test_to_softbank + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_kddi) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_kddi) } + + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_undoc_kddi) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_undoc_kddi) } + + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_kddi) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_kddi) } + 
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@iso2022jp_kddi) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@iso2022jp_kddi) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_undoc_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_undoc_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_kddi_only) } + + assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@iso2022jp_kddi_only) } + assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@iso2022jp_kddi_only) } + end + end + + class TestSoftBank < Test::Unit::TestCase + include Emoji + + def setup + setup_instance_variable(self) + end + + def test_encoding_name + %w(UTF8-SoftBank + SJIS-SoftBank).each do |n| + assert_include Encoding.name_list, n, "encoding not found: #{n}" + end + end + + def test_comparison + assert_not_equal Encoding::UTF_8, Encoding::UTF8_SoftBank + assert_not_equal Encoding::Windows_31J, Encoding::SJIS_SoftBank + end + + def test_from_utf8 + assert_nothing_raised { assert_equal utf8_softbank(@aiueo_utf8), to_utf8_softbank(@aiueo_utf8) } + assert_nothing_raised { assert_equal sjis_softbank(@aiueo_sjis), to_sjis_softbank(@aiueo_utf8) } + end + + def test_from_sjis + assert_nothing_raised { assert_equal utf8_softbank(@aiueo_utf8), to_utf8_softbank(@aiueo_sjis) } + assert_nothing_raised { assert_equal 
sjis_softbank(@aiueo_sjis), to_sjis_softbank(@aiueo_sjis) } + end + + def test_to_utf8 + assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_softbank) } + assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_softbank) } + end + + def test_to_sjis + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_softbank) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_softbank) } + end + + def test_to_eucjp + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_softbank) } + assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_softbank) } + end + + def test_softbank + assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_softbank) } + assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_softbank) } + end + + def test_to_docomo + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_softbank) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_softbank) } + + assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_softbank) } + assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_softbank) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_docomo(@utf8_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_docomo(@utf8_softbank_only) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_docomo(@sjis_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_docomo(@sjis_softbank_only) } + end + + def test_to_kddi + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@utf8_softbank) } + assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_softbank) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_softbank) } + + assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_softbank) } + assert_nothing_raised { assert_equal @sjis_kddi, 
to_sjis_kddi(@sjis_softbank) } + assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_softbank) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@utf8_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@utf8_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@utf8_softbank_only) } + + assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@sjis_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@sjis_softbank_only) } + assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@sjis_softbank_only) } + end + end + + private + + def setup_instance_variable(obj) + obj.instance_eval do + @aiueo_utf8 = "\u{3042}\u{3044}\u{3046}\u{3048}\u{304A}" + @aiueo_sjis = to_sjis(@aiueo_utf8) + @aiueo_iso2022jp = to_iso2022jp(@aiueo_utf8) + + @utf8 = "\u{2600}" + + @utf8_docomo = utf8_docomo("\u{E63E}") + @sjis_docomo = sjis_docomo("\xF8\x9F") + @utf8_docomo_only = utf8_docomo("\u{E6B1}") + @sjis_docomo_only = sjis_docomo("\xF9\x55") + + @utf8_kddi = utf8_kddi("\u{E488}") + @utf8_undoc_kddi = utf8_kddi("\u{EF60}") + @sjis_kddi = sjis_kddi("\xF6\x60") + @iso2022jp_kddi = iso2022jp_kddi("\x1B$B\x75\x41\x1B(B") + @stateless_iso2022jp_kddi = stateless_iso2022jp_kddi("\x92\xF5\xC1") + @utf8_kddi_only = utf8_kddi("\u{E5B3}") + @utf8_undoc_kddi_only = utf8_kddi("\u{F0D0}") + @sjis_kddi_only = sjis_kddi("\xF7\xD0") + @iso2022jp_kddi_only = iso2022jp_kddi("\x1B$B\x78\x52\x1B(B") + @stateless_iso2022jp_kddi_only = stateless_iso2022jp_kddi("\x92\xF8\xD2") + + @utf8_softbank = utf8_softbank("\u{E04A}") + @sjis_softbank = sjis_softbank("\xF9\x8B") + @utf8_softbank_only = utf8_softbank("\u{E524}") + @sjis_softbank_only = sjis_softbank("\xFB\xC4") + end + end + + def utf8(str) + str.force_encoding("UTF-8") + end + + def to_utf8(str) + str.encode("UTF-8") + end + + def to_sjis(str) + str.encode("Windows-31J") + end + + def 
to_eucjp(str) + str.encode("eucJP-ms") + end + + def to_iso2022jp(str) + str.encode("ISO-2022-JP") + end + + def utf8_docomo(str) + str.force_encoding("UTF8-DoCoMo") + end + + def to_utf8_docomo(str) + str.encode("UTF8-DoCoMo") + end + + def utf8_kddi(str) + str.force_encoding("UTF8-KDDI") + end + + def to_utf8_kddi(str) + str.encode("UTF8-KDDI") + end + + def utf8_softbank(str) + str.force_encoding("UTF8-SoftBank") + end + + def to_utf8_softbank(str) + str.encode("UTF8-SoftBank") + end + + def sjis_docomo(str) + str.force_encoding("SJIS-DoCoMo") + end + + def to_sjis_docomo(str) + str.encode("SJIS-DoCoMo") + end + + def sjis_kddi(str) + str.force_encoding("SJIS-KDDI") + end + + def to_sjis_kddi(str) + str.encode("SJIS-KDDI") + end + + def sjis_softbank(str) + str.force_encoding("SJIS-SoftBank") + end + + def to_sjis_softbank(str) + str.encode("SJIS-SoftBank") + end + + def iso2022jp_kddi(str) + str.force_encoding("ISO-2022-JP-KDDI") + end + + def to_iso2022jp_kddi(str) + str.encode("ISO-2022-JP-KDDI") + end + + def stateless_iso2022jp_kddi(str) + str.force_encoding("stateless-ISO-2022-JP-KDDI") + end + + def to_stateless_iso2022jp_kddi(str) + str.encode("stateless-ISO-2022-JP-KDDI") + end + +end diff --git a/test/ruby/enc/test_emoji_breaks.rb b/test/ruby/enc/test_emoji_breaks.rb new file mode 100644 index 0000000000..0873e681c3 --- /dev/null +++ b/test/ruby/enc/test_emoji_breaks.rb @@ -0,0 +1,155 @@ +# frozen_string_literal: true +# Copyright © 2018 Martin J. 
Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class TestEmojiBreaks < Test::Unit::TestCase + class BreakTest + attr_reader :string, :comment, :filename, :line_number, :type, :shortname + + def initialize(filename, line_number, data, comment='') + @filename = filename + @line_number = line_number + @comment = comment.gsub(/\s+/, ' ').strip + if filename=='emoji-test' or filename=='emoji-variation-sequences' + codes, @type = data.split(/\s*;\s*/) + @shortname = '' + else + codes, @type, @shortname = data.split(/\s*;\s*/) + end + @type = @type.gsub(/\s+/, ' ').strip + @shortname = @shortname.gsub(/\s+/, ' ').strip + @string = codes.split(/\s+/) + .map do |ch| + c = ch.to_i(16) + # eliminate cases with surrogates + # raise ArgumentError if 0xD800 <= c and c <= 0xDFFF + c.chr('UTF-8') + end.join + end + end + + class BreakFile + attr_reader :basename, :fullname, :version + FILES = [] + + def initialize(basename, path, version) + @basename = basename + @fullname = "#{path}/#{basename}.txt" # File.expand_path(path + version, __dir__) + @version = version + FILES << self + end + + def self.files + FILES + end + end + + UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] + UNICODE_DATA_PATH = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/ucd/emoji", __dir__) + EMOJI_VERSION = RbConfig::CONFIG['UNICODE_EMOJI_VERSION'] + EMOJI_DATA_PATH = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__) + + EMOJI_DATA_FILES = %w[emoji-sequences emoji-test emoji-zwj-sequences].map do |basename| + BreakFile.new(basename, EMOJI_DATA_PATH, EMOJI_VERSION) + end + UNICODE_DATA_FILE = BreakFile.new('emoji-variation-sequences', UNICODE_DATA_PATH, EMOJI_VERSION) + EMOJI_DATA_FILES << UNICODE_DATA_FILE + + def self.data_files_available? + EMOJI_DATA_FILES.all? do |f| + File.exist?(f.fullname) + end + end + + def test_data_files_available + assert_equal 4, EMOJI_DATA_FILES.size # debugging test + unless TestEmojiBreaks.data_files_available? 
+ omit "Emoji data files not available in #{EMOJI_DATA_PATH}." + end + end + + if data_files_available? + def read_data + tests = [] + EMOJI_DATA_FILES.each do |file| + version_mismatch = true + file_tests = [] + File.foreach(file.fullname, encoding: Encoding::UTF_8) do |line| + line.chomp! + if $.==1 + if line=="# #{file.basename}-#{file.version}.txt" + version_mismatch = false + elsif line!="# #{file.basename}.txt" + raise "File Name Mismatch: line: #{line}, expected filename: #{file.basename}.txt" + end + end + version_mismatch = false if line =~ /^# Version: #{file.version}/ # 13.0 and older + version_mismatch = false if line =~ /^# Used with Emoji Version #{EMOJI_VERSION}/ # 14.0 and newer + next if line.match?(/\A(#|\z)/) + if line =~ /^(\h{4,6})\.\.(\h{4,6}) *(;.+)/ # deal with Unicode ranges in emoji-sequences.txt (Bug #18028) + range_start = $1.to_i(16) + range_end = $2.to_i(16) + rest = $3 + (range_start..range_end).each do |code_point| + file_tests << BreakTest.new(file.basename, $., *(code_point.to_s(16)+rest).split('#', 2)) + end + else + file_tests << BreakTest.new(file.basename, $., *line.split('#', 2)) + end + end + raise "File Version Mismatch: file: #{file.fullname}, version: #{file.version}" if version_mismatch + tests += file_tests + end + tests + end + + def all_tests + @@tests ||= read_data + rescue Errno::ENOENT + @@tests ||= [] + end + + def test_single_emoji + all_tests.each do |test| + expected = [test.string] + actual = test.string.each_grapheme_cluster.to_a + assert_equal expected, actual, + "file: #{test.filename}, line #{test.line_number}, " + + "type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}" + end + end + + def test_embedded_emoji + all_tests.each do |test| + expected = ["\t", test.string, "\t"] + actual = "\t#{test.string}\t".each_grapheme_cluster.to_a + assert_equal expected, actual, + "file: #{test.filename}, line #{test.line_number}, " + + "type: #{test.type}, shortname: #{test.shortname}, comment: 
#{test.comment}" + end + end + + # test some pseudorandom combinations of emoji + def test_mixed_emoji + srand 0 + length = all_tests.length + step = 503 # use a prime number + all_tests.each do |test1| + start = rand step + start.step(by: step, to: length-1) do |t2| + test2 = all_tests[t2] + # exclude skin tones, because they glue to previous grapheme clusters + next if (0x1F3FB..0x1F3FF).include? test2.string.ord + expected = [test1.string, test2.string] + actual = (test1.string+test2.string).each_grapheme_cluster.to_a + assert_equal expected, actual, + "file1: #{test1.filename}, line1 #{test1.line_number}, " + + "file2: #{test2.filename}, line2 #{test2.line_number},\n" + + "type1: #{test1.type}, shortname1: #{test1.shortname}, comment1: #{test1.comment},\n" + + "type2: #{test2.type}, shortname2: #{test2.shortname}, comment2: #{test2.comment}" + end + end + end + end +end diff --git a/test/ruby/enc/test_euc_jp.rb b/test/ruby/enc/test_euc_jp.rb new file mode 100644 index 0000000000..4aec69e4db --- /dev/null +++ b/test/ruby/enc/test_euc_jp.rb @@ -0,0 +1,25 @@ +# vim: set fileencoding=euc-jp +# frozen_string_literal: false + +require "test/unit" + +class TestEUC_JP < Test::Unit::TestCase + def test_mbc_case_fold + assert_match(/()(a)\1\2/i, "aA") + assert_match(/()(a)\1\2/i, "aA") + end + + def test_property + assert_match(/{0}\p{Hiragana}{4}/, "Ҥ餬") + assert_no_match(/{0}\p{Hiragana}{4}/, "") + assert_no_match(/{0}\p{Hiragana}{4}/, "") + assert_no_match(/{0}\p{Katakana}{4}/, "Ҥ餬") + assert_match(/{0}\p{Katakana}{4}/, "") + assert_no_match(/{0}\p{Katakana}{4}/, "") + assert_raise(RegexpError) { Regexp.new('{0}\p{foobarbaz}') } + end + + def test_charboundary + assert_nil(/\xA2\xA2/ =~ "\xA1\xA2\xA2\xA3") + end +end diff --git a/test/ruby/enc/test_euc_kr.rb b/test/ruby/enc/test_euc_kr.rb new file mode 100644 index 0000000000..c9de2cc4e1 --- /dev/null +++ b/test/ruby/enc/test_euc_kr.rb @@ -0,0 +1,37 @@ +# frozen_string_literal: false +require "test/unit" + +class
TestEucKr < Test::Unit::TestCase + def s(s) + s.force_encoding("euc-kr") + end + + def test_mbc_enc_len + assert_equal(1, s("\xa1\xa1").size) + end + + def test_mbc_to_code + assert_equal(0xa1a1, s("\xa1\xa1").ord) + end + + def test_code_to_mbc + assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-kr")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\xa1\xa1)\\1"), "i") + assert_match(r, s("\xa1\xa1\xa1\xa1")) + end + + def test_left_adjust_char_head + assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop) + end + + def test_euro_sign + assert_equal("\u{20ac}", s("\xa2\xe6").encode("utf-8")) + end + + def test_registered_mark + assert_equal("\u{00ae}", s("\xa2\xe7").encode("utf-8")) + end +end diff --git a/test/ruby/enc/test_euc_tw.rb b/test/ruby/enc/test_euc_tw.rb new file mode 100644 index 0000000000..649b1b81c6 --- /dev/null +++ b/test/ruby/enc/test_euc_tw.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: false +require "test/unit" + +class TestEucTw < Test::Unit::TestCase + def s(s) + s.force_encoding("euc-tw") + end + + def test_mbc_enc_len + assert_equal(1, s("\xa1\xa1").size) + end + + def test_mbc_to_code + assert_equal(0xa1a1, s("\xa1\xa1").ord) + end + + def test_code_to_mbc + assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-tw")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\xa1\xa1)\\1"), "i") + assert_match(r, s("\xa1\xa1\xa1\xa1")) + end + + def test_left_adjust_char_head + assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop) + end +end diff --git a/test/ruby/enc/test_gb18030.rb b/test/ruby/enc/test_gb18030.rb new file mode 100644 index 0000000000..76ac785951 --- /dev/null +++ b/test/ruby/enc/test_gb18030.rb @@ -0,0 +1,127 @@ +# frozen_string_literal: false +require "test/unit" + +class TestGB18030 < Test::Unit::TestCase + def s(s) + s.force_encoding("gb18030") + end + + def test_mbc_enc_len + assert_equal(1, s("\x81\x40").size) + assert_equal(1, s("\x81\x30\x81\x30").size) + end + + def test_mbc_to_code + assert_equal(0x8140, s("\x81\x40").ord) + 
end + + def test_code_to_mbc + assert_equal(s("\x81\x40"), 0x8140.chr("gb18030")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\x81\x40)\\1"), "i") + assert_match(r, s("\x81\x40\x81\x40")) + end + + def scheck(c, i) + assert_equal(s(c.reverse.take(c.size - i).join), s(c.reverse.join).chop) + end + + def fcheck(c) + c = s(c.reverse.join) + assert_raise(ArgumentError, c) { c.chop } + end + + def test_left_adjust_char_head + # C1: 00-2f, 3a-3f, 7f, ff + # C2: 40-7e, 80 + # C4: 30-39 + # CM: 81-fe + c1 = "\x2f" + c2 = "\x40" + c4 = "\x30" + cm = "\x81" + + # S_START-c1 + # S_START-c2-S_one_C2-0 + # S_START-c2-S_one_C2-c1 + # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-c1 + # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-c1 + # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-cm-S_odd_CM_one_CX(rec) + # S_START-c4-S_one_C4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c4-S_one_C4_odd_CMC4(rec) + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-cm-S_odd_CM_odd_CMC4(rec) + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-c1 + # 
S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-c1 + # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-cm-S_odd_CM_even_CMC4(rec) + # S_START-c4-S_one_C4-cm-S_one_CMC4-cm-S_even_CM_one_CX(rec) + # S_START-cm-S_one_CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c4-S_odd_C4CM(rec) + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-cm-S_even_CM_even_C4CM(rec) + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-c1 + # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-cm-S_even_CM_odd_C4CM(rec) + # S_START-cm-S_one_CM-cm-S_odd_CM_one_CX(rec) + + scheck([c1], 1) + scheck([c2], 1) + scheck([c2, c1], 1) + scheck([c2, cm, c1], 2) + scheck([c2, cm, cm, c1], 1) + scheck([c2, cm, cm, cm], 2) + scheck([c4], 1) + scheck([c4, c1], 1) + scheck([c4, cm], 2) + scheck([c4, cm, c1], 2) + scheck([c4, cm, c4, c1], 2) + scheck([c4, cm, c4, cm], 4) + scheck([c4, cm, c4, cm, c1], 4) + scheck([c4, cm, c4, cm, c4], 4) + scheck([c4, cm, c4, cm, c4, c1], 4) + scheck([c4, cm, c4, cm, c4, cm], 2) + 
scheck([c4, cm, c4, cm, c4, cm, c1], 2) + scheck([c4, cm, c4, cm, c4, cm, c4], 2) + scheck([c4, cm, c4, cm, c4, cm, cm, c1], 4) + scheck([c4, cm, c4, cm, c4, cm, cm, cm], 2) + scheck([c4, cm, c4, cm, c4, cm, cm, cm, c1], 2) + scheck([c4, cm, c4, cm, c4, cm, cm, cm, cm], 4) + scheck([c4, cm, c4, cm, cm, c1], 2) + scheck([c4, cm, c4, cm, cm, cm], 4) + scheck([c4, cm, c4, cm, cm, cm, c1], 4) + scheck([c4, cm, c4, cm, cm, cm, cm], 2) + scheck([c4, cm, cm], 1) + scheck([cm], 1) + scheck([cm, c1], 1) + scheck([cm, c4, c1], 1) + scheck([cm, c4, cm], 3) + scheck([cm, c4, cm, c1], 3) + scheck([cm, c4, cm, c4], 3) + scheck([cm, c4, cm, c4, c1], 3) + scheck([cm, c4, cm, c4, cm], 1) + scheck([cm, c4, cm, c4, cm, c1], 1) + scheck([cm, c4, cm, c4, cm, c4], 1) + scheck([cm, c4, cm, c4, cm, cm, c1], 3) + scheck([cm, c4, cm, c4, cm, cm, cm], 1) + scheck([cm, c4, cm, c4, cm, cm, cm, c1], 1) + scheck([cm, c4, cm, c4, cm, cm, cm, cm], 3) + scheck([cm, c4, cm, cm, c1], 1) + scheck([cm, c4, cm, cm, cm], 3) + scheck([cm, c4, cm, cm, cm, c1], 3) + scheck([cm, c4, cm, cm, cm, cm], 1) + scheck([cm, cm], 2) + end +end diff --git a/test/ruby/enc/test_gbk.rb b/test/ruby/enc/test_gbk.rb new file mode 100644 index 0000000000..2e541b5821 --- /dev/null +++ b/test/ruby/enc/test_gbk.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: false +require "test/unit" + +class TestGBK < Test::Unit::TestCase + def s(s) + s.force_encoding("gbk") + end + + def test_mbc_enc_len + assert_equal(1, s("\x81\x40").size) + end + + def test_mbc_to_code + assert_equal(0x8140, s("\x81\x40").ord) + end + + def test_code_to_mbc + assert_equal(s("\x81\x40"), 0x8140.chr("gbk")) + end + + def test_mbc_case_fold + r = Regexp.new(s("(\x81\x40)\\1"), "i") + assert_match(r, s("\x81\x40\x81\x40")) + end + + def test_left_adjust_char_head + assert_equal(s("\x81\x40"), s("\x81\x40\x81\x40").chop) + end +end diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb new file mode 100644 index 
0000000000..7e6d722d40 --- /dev/null +++ b/test/ruby/enc/test_grapheme_breaks.rb @@ -0,0 +1,92 @@ +# frozen_string_literal: true +# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp) + +require "test/unit" + +class TestGraphemeBreaksFromFile < Test::Unit::TestCase + class BreakTest + attr_reader :clusters, :string, :comment, :line_number + + def initialize(line_number, data, comment) + @line_number = line_number + @comment = comment + @clusters = data.sub(/\A\s*÷\s*/, '') + .sub(/\s*÷\s*\z/, '') + .split(/\s*÷\s*/) + .map do |cl| + cl.split(/\s*×\s*/) + .map do |ch| + c = ch.to_i(16) + # eliminate cases with surrogates + raise ArgumentError if 0xD800 <= c and c <= 0xDFFF + c.chr('UTF-8') + end.join + end + @string = @clusters.join + end + end + + UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] + path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__) + UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path + GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__) + + def self.file_available? + File.exist? GRAPHEME_BREAK_TEST_FILE + end + + def test_data_files_available + unless TestGraphemeBreaksFromFile.file_available? + omit "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}." + end + end + + if file_available? + def read_data + tests = [] + File.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line| + if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt") + raise "File Version Mismatch" + end + next if /\A#/.match? 
line + tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever' + end + tests + end + + def all_tests + @@tests ||= read_data + rescue Errno::ENOENT + @@tests ||= [] + end + + def test_each_grapheme_cluster + all_tests.each do |test| + expected = test.clusters + actual = test.string.each_grapheme_cluster.to_a + assert_equal expected, actual, + "line #{test.line_number}, expected '#{expected}', " + + "but got '#{actual}', comment: #{test.comment}" + end + end + + def test_backslash_X + all_tests.each do |test| + clusters = test.clusters.dup + string = test.string.dup + removals = 0 + while string.sub!(/\A\X/, '') + removals += 1 + clusters.shift + expected = clusters.join + assert_equal expected, string, + "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + assert_equal expected, string, + "line #{test.line_number}, after last removal, expected '#{expected}', " + + "but got '#{string}', comment: #{test.comment}" + end + end + end +end diff --git a/test/ruby/enc/test_iso_8859.rb b/test/ruby/enc/test_iso_8859.rb new file mode 100644 index 0000000000..ed663be243 --- /dev/null +++ b/test/ruby/enc/test_iso_8859.rb @@ -0,0 +1,166 @@ +# frozen_string_literal: false +require 'test/unit' + +class TestISO8859 < Test::Unit::TestCase + ASSERTS = %q( + assert_match(/^(\xdf)\1$/i, "\xdf\xdf") + assert_match(/^(\xdf)\1$/i, "ssss") + # assert_match(/^(\xdf)\1$/i, "\xdfss") # this must be bug... 
+ assert_match(/^[\xdfz]+$/i, "sszzsszz") + assert_match(/^SS$/i, "\xdf") + assert_match(/^Ss$/i, "\xdf") + ((0xc0..0xde).to_a - [0xd7]).each do |c| + c1 = c.chr("ENCODING") + c2 = (c + 0x20).chr("ENCODING") + assert_match(/^(#{ c1 })\1$/i, c2 + c1) + assert_match(/^(#{ c2 })\1$/i, c1 + c2) + assert_match(/^[#{ c1 }]+$/i, c2 + c1) + assert_match(/^[#{ c2 }]+$/i, c1 + c2) + end + assert_match(/^\xff$/i, "\xff") + ) + + def test_iso_8859_1 + eval("# encoding: iso8859-1\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-1")) + end + + def test_iso_8859_2 + eval("# encoding: iso8859-2\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-2")) + end + + def test_iso_8859_3 + # todo: decide on behavior, test, and fix implementation re. İ and ı (0xA9/0xB9) + # treating them as case equivalents is definitely an error + eval(%q(# encoding: iso8859-3 + assert_match(/^(\xdf)\1$/i, "\xdf\xdf") + assert_match(/^(\xdf)\1$/i, "ssss") + assert_match(/^[\xdfz]+$/i, "sszzsszz") + assert_match(/^SS$/i, "\xdf") + assert_match(/^Ss$/i, "\xdf") + [0xa1, 0xa6, *(0xaa..0xac), 0xaf].each do |c| + c1 = c.chr("iso8859-3") + c2 = (c + 0x10).chr("iso8859-3") + assert_match(/^(#{ c1 })\1$/i, c2 + c1) + assert_match(/^(#{ c2 })\1$/i, c1 + c2) + assert_match(/^[#{ c1 }]+$/i, c2 + c1) + assert_match(/^[#{ c2 }]+$/i, c1 + c2) + end + ([*(0xc0..0xde)] - [0xc3, 0xd0, 0xd7]).each do |c| + c1 = c.chr("iso8859-3") + c2 = (c + 0x20).chr("iso8859-3") + assert_match(/^(#{ c1 })\1$/i, c2 + c1) + assert_match(/^(#{ c2 })\1$/i, c1 + c2) + assert_match(/^[#{ c1 }]+$/i, c2 + c1) + assert_match(/^[#{ c2 }]+$/i, c1 + c2) + end + )) + end + + def test_iso_8859_4 + eval("# encoding: iso8859-4\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-4")) + end + + def test_iso_8859_5 + eval(%q(# encoding: iso8859-5 + (0xb0..0xcf).each do |c| + c1 = c.chr("iso8859-5") + c2 = (c + 0x20).chr("iso8859-5") + assert_match(/^(#{ c1 })\1$/i, c2 + c1) + assert_match(/^(#{ c2 })\1$/i, c1 + c2) + assert_match(/^[#{ c1 }]+$/i, c2 + c1) + assert_match(/^[#{ c2 }]+$/i, 
# Case-insensitive back-reference check for ISO-8859-6.
#
# The assertions are held in a %q string whose first line is an
# "# encoding: iso8859-6" magic comment and are run through eval.
# For every listed byte value (0xA4, 0xAC, 0xBB, 0xBF, 0xC1..0xDA, 0xE0..0xF2)
# the one-character string is built with Integer#chr and /^(c)\1$/i must match
# that character repeated twice. No byte is paired with a different byte here,
# unlike the tests for encodings that have upper/lower case pairs.
# NOTE(review): the byte list presumably mirrors the code points assigned in
# ISO-8859-6 -- confirm against the charset table.
def test_iso_8859_6
  eval(%q(# encoding: iso8859-6
    [0xa4, 0xac, 0xbb, 0xbf, *(0xc1..0xda), *(0xe0..0xf2)].each do |c|
      c1 = c.chr("iso8859-6")
      assert_match(/^(#{ c1 })\1$/i, c1 * 2)
    end
  ))
end
# Case-insensitive regexp matching checks for the KOI8-R and KOI8-U encodings.
class TestKOI8 < Test::Unit::TestCase
  # Assertion template shared by both tests. The placeholder ENCODING is
  # replaced with the concrete encoding name before the text is evaluated.
  # For every byte in 0xC0..0xDF it pairs the byte with the one 0x20 above it
  # and requires the two to match each other under /i, both through a
  # back-reference and through a character class.
  ASSERTS = %q(
    (0xc0..0xdf).each do |c|
      c1 = c.chr("ENCODING")
      c2 = (c + 0x20).chr("ENCODING")
      assert_match(/^(#{ c1 })\1$/i, c2 + c1)
      assert_match(/^(#{ c2 })\1$/i, c1 + c2)
      assert_match(/^[#{ c1 }]+$/i, c2 + c1)
      assert_match(/^[#{ c2 }]+$/i, c1 + c2)
    end
  )

  # Runs the shared template with ENCODING set to koi8-r.
  def test_koi8_r
    snippet = ASSERTS.gsub("ENCODING", "koi8-r")
    eval("# encoding: koi8-r\n" + snippet)
  end

  # Runs the shared template with ENCODING set to koi8-u.
  def test_koi8_u
    snippet = ASSERTS.gsub("ENCODING", "koi8-u")
    eval("# encoding: koi8-u\n" + snippet)
  end
end
"#{path}/ucd" : path + CaseTest = Struct.new :source, :target, :kind, :line + + def check_downcase_properties(expected, start, *flags) + assert_equal expected, start.downcase(*flags) + temp = start.dup + assert_equal expected, temp.downcase!(*flags) + assert_equal expected, expected.downcase(*flags) + temp = expected + assert_nil temp.downcase!(*flags) + end + + def read_tests + File.readlines("#{UNICODE_DATA_PATH}/CaseFolding.txt", encoding: Encoding::ASCII_8BIT) + .collect.with_index { |linedata, linenumber| [linenumber.to_i+1, linedata.chomp] } + .reject { |number, data| data =~ /^(#|$)/ } + .collect do |linenumber, linedata| + data, _ = linedata.split(/#\s*/) + code, kind, result, _ = data.split(/;\s*/) + CaseTest.new code.to_i(16).chr('UTF-8'), + result.split(/ /).collect { |hex| hex.to_i(16) }.pack('U*'), + kind, linenumber + end.select { |test| test.kind=='C' } + end + + def to_codepoints(string) + string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') } + end + + def setup + @@tests ||= read_tests + rescue Errno::ENOENT => e + @@tests ||= [] + omit e.message + end + + def self.generate_test_casefold(encoding) + define_method "test_mbc_case_fold_#{encoding}" do + @@tests.each do |test| + begin + source = test.source.encode encoding + target = test.target.encode encoding + assert_equal 5, "12345#{target}67890" =~ /#{source}/i, + "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " + + "(CaseFolding.txt line #{test[:line]})" + rescue Encoding::UndefinedConversionError + end + end + end + + define_method "test_get_case_fold_codes_by_str_#{encoding}" do + @@tests.each do |test| + begin + source = test.source.encode encoding + target = test.target.encode encoding + assert_equal 5, "12345#{source}67890" =~ /#{target}/i, + "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " + + "(CaseFolding.txt line #{test[:line]}), " + + "error may also be triggered by 
# Feeds every loaded case-folding entry to check_downcase_properties,
# expecting the entry's source to downcase to its target with the :fold flag.
def test_downcase_fold
  @@tests.each { |entry| check_downcase_properties(entry.target, entry.source, :fold) }
end
# Renders +obj+ as a Ruby expression string that makes its encoding explicit,
# for use in assertion messages.
#
# * String: returns String#dump output; when that output does not already end
#   in a .force_encoding("...") call, one naming the string's encoding is
#   appended.
# * Regexp: returns a Regexp.new(...) expression built from the dumped source
#   and the numeric options.
#
# Raises ArgumentError for any other kind of object.
def encdump(obj)
  case obj
  when String
    d = obj.dump
    if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
      d
    else
      "#{d}.force_encoding(#{obj.encoding.name.dump})"
    end
  when Regexp
    "Regexp.new(#{encdump(obj.source)}, #{obj.options})"
  else
    # The original referenced the undefined constant `Argument`, which made
    # this branch fail with NameError instead of reporting the bad argument.
    raise ArgumentError, "unexpected: #{obj.inspect}"
  end
end
# Verifies String#valid_encoding? for byte sequences tagged as UTF-16BE.
# The first list must be reported as valid, the second as invalid; every
# sequence is checked inside its own a.for(...) scope of all_assertions.
def test_utf16be_valid_encoding
  well_formed = [
    "\x00\x00",
    "\xd7\xff",
    "\xd8\x00\xdc\x00",
    "\xdb\xff\xdf\xff",
    "\xe0\x00",
    "\xff\xff",
  ]
  malformed = [
    "\x00",
    "\xd7",
    "\xd8\x00",
    "\xd8\x00\xd8\x00",
    "\xdc\x00",
    "\xdc\x00\xd8\x00",
    "\xdc\x00\xdc\x00",
    "\xe0",
    "\xff",
  ]

  all_assertions do |a|
    well_formed.each do |bytes|
      bytes.force_encoding("utf-16be")
      a.for(bytes) do
        assert_predicate(bytes, :valid_encoding?, "#{encdump bytes}.valid_encoding?")
      end
    end

    malformed.each do |bytes|
      bytes.force_encoding("utf-16be")
      a.for(bytes) do
        assert_not_predicate(bytes, :valid_encoding?, "#{encdump bytes}.valid_encoding?")
      end
    end
  end
end
s1.casecmp(s2), "#{encdump s1}.casecmp(#{encdump s2})") + end + + def test_end_with + s1 = "ab".force_encoding("utf-16be") + s2 = "b".force_encoding("utf-16be") + assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})") + end + + def test_hex + assert_raise(Encoding::CompatibilityError) { + "ff".encode("utf-16le").hex + } + assert_raise(Encoding::CompatibilityError) { + "ff".encode("utf-16be").hex + } + end + + def test_oct + assert_raise(Encoding::CompatibilityError) { + "77".encode("utf-16le").oct + } + assert_raise(Encoding::CompatibilityError) { + "77".encode("utf-16be").oct + } + end + + def test_count + s1 = "aa".force_encoding("utf-16be") + s2 = "aa" + assert_raise(Encoding::CompatibilityError, "#{encdump s1}.count(#{encdump s2})") { + s1.count(s2) + } + end + + def test_plus + s1 = "a".force_encoding("us-ascii") + s2 = "aa".force_encoding("utf-16be") + assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") { + s1 + s2 + } + end + + def test_encoding_find + assert_raise(ArgumentError) { + Encoding.find("utf-8".force_encoding("utf-16be")) + } + end + + def test_interpolation + s = "aa".force_encoding("utf-16be") + assert_raise(Encoding::CompatibilityError, "\"a\#{#{encdump s}}\"") { + "a#{s}" + } + end + + def test_slice! 
# String#+ with an empty receiver and a UTF-16BE argument must not raise.
# The failure message shows the `+` expression under test (it previously
# showed `<<`, which is what the test_concat_* tests exercise).
def test_plus_empty1
  s1 = ""
  s2 = "aa".force_encoding("utf-16be")
  assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
    s1 + s2
  }
end

# String#+ with a non-empty receiver and an empty UTF-16BE argument must not
# raise.
def test_plus_empty2
  s1 = "aa"
  s2 = "".force_encoding("utf-16be")
  assert_nothing_raised("#{encdump s1} + #{encdump s2}") {
    s1 + s2
  }
end

# String#+ with two non-empty strings, one of them UTF-16BE, must raise
# Encoding::CompatibilityError.
def test_plus_nonempty
  s1 = "aa"
  s2 = "bb".force_encoding("utf-16be")
  assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
    s1 + s2
  }
end
# Newline recognition for UTF-16LE / UTF-16BE strings.
#
# 1. String#lines splits "foo\nbar\nbaz\n" (spelled out as UTF-16 code units)
#    into three lines that keep their terminators.
# 2. String#chomp removes one encoded trailing newline and is a no-op when
#    applied a second time.
# 3. String#chomp leaves strings untouched whose last byte is a lone "\n"
#    that is not a complete code unit.
def test_is_mbc_newline
  le = ->(raw) { raw.force_encoding("utf-16le") }
  be = ->(raw) { raw.force_encoding("utf-16be") }

  le_lines = le.call("f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n\0").lines.to_a
  be_lines = be.call("\0f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n").lines.to_a
  assert_equal(le.call("f\0o\0o\0\n\0"), le_lines.shift)
  assert_equal(le.call("b\0a\0r\0\n\0"), le_lines.shift)
  assert_equal(le.call("b\0a\0z\0\n\0"), le_lines.shift)
  assert_equal(be.call("\0f\0o\0o\0\n"), be_lines.shift)
  assert_equal(be.call("\0b\0a\0r\0\n"), be_lines.shift)
  assert_equal(be.call("\0b\0a\0z\0\n"), be_lines.shift)

  le_terminated = le.call("f\0o\0o\0\n\0")
  be_terminated = be.call("\0f\0o\0o\0\n")
  le_bare = le.call("f\0o\0o\0")
  be_bare = be.call("\0f\0o\0o")
  assert_equal(le_bare, le_terminated.chomp)
  assert_equal(le_bare, le_terminated.chomp.chomp)
  assert_equal(be_bare, be_terminated.chomp)
  assert_equal(be_bare, be_terminated.chomp.chomp)

  le_truncated = le.call("f\0o\0o\0\n")
  be_truncated = be.call("\0f\0o\0o\n")
  assert_equal(le_truncated, le_truncated.chomp)
  assert_equal(be_truncated, be_truncated.chomp)
end
# Widens every byte of +s+ to four bytes (the byte followed by three zero
# bytes) and tags the result as UTF-32LE.
#
# NOTE: despite the "utf16" in its name this helper produces UTF-32LE data;
# the name is kept so existing callers keep working.
#
# s - String whose bytes are widened one by one.
#
# Returns a new String with encoding UTF-32LE.
def ascii_to_utf16le(s)
  s.bytes.flat_map { |byte| [byte, 0, 0, 0] }.pack("C*").force_encoding("utf-32le")
end
# Rebuilds +s+ character by character in the encoding named by +e+: each
# character's code point is re-created with Integer#chr(e) and the pieces are
# joined into one string.
def utf8_to_utf32(s, e)
  pieces = s.chars.map do |char|
    code_point = char.ord
    code_point.chr(e)
  end
  pieces.join
end
# Case-insensitive regexp matching checks for the Windows-1251 encoding.
class TestWindows1251 < Test::Unit::TestCase
  # For every byte in 0xC0..0xDF, the byte and the one 0x20 above it must
  # match each other under /i, both through a back-reference and through a
  # character class.
  def test_windows_1251
    0xc0.upto(0xdf) do |code|
      first = code.chr("windows-1251")
      second = (code + 0x20).chr("windows-1251")
      second_first = second + first
      first_second = first + second
      assert_match(/^(#{ first })\1$/i, second_first)
      assert_match(/^(#{ second })\1$/i, first_second)
      assert_match(/^[#{ first }]+$/i, second_first)
      assert_match(/^[#{ second }]+$/i, first_second)
    end
  end
end
# Case-insensitive matching of Windows-1252 byte pairs.
# The two code lists are parallel: the byte at a given position in the first
# list must match the byte at the same position in the second list under /i,
# both through a back-reference and through a character class.
def test_windows_1252
  first_codes = [0x8a, 0x8c, 0x8e, *0xc0..0xd6, *0xd8..0xde, 0x9f]
  second_codes = [0x9a, 0x9c, 0x9e, *0xe0..0xf6, *0xf8..0xfe, 0xff]

  first_codes.each_with_index do |code, index|
    first = code.chr("windows-1252")
    second = second_codes[index].chr("windows-1252")
    assert_match(/^(#{ first })\1$/i, second + first)
    assert_match(/^(#{ second })\1$/i, first + second)
    assert_match(/^[#{ first }]+$/i, second + first)
    assert_match(/^[#{ second }]+$/i, first + second)
  end
end
