summary refs log tree commit diff
path: root/test/ruby/enc
diff options
context:
space:
mode:
Diffstat (limited to 'test/ruby/enc')
-rw-r--r-- test/ruby/enc/test_big5.rb 29
-rw-r--r-- test/ruby/enc/test_case_comprehensive.rb 306
-rw-r--r-- test/ruby/enc/test_case_mapping.rb 231
-rw-r--r-- test/ruby/enc/test_case_options.rb 81
-rw-r--r-- test/ruby/enc/test_cesu8.rb 113
-rw-r--r-- test/ruby/enc/test_cp949.rb 29
-rw-r--r-- test/ruby/enc/test_emoji.rb 443
-rw-r--r-- test/ruby/enc/test_emoji_breaks.rb 155
-rw-r--r-- test/ruby/enc/test_euc_jp.rb 25
-rw-r--r-- test/ruby/enc/test_euc_kr.rb 37
-rw-r--r-- test/ruby/enc/test_euc_tw.rb 29
-rw-r--r-- test/ruby/enc/test_gb18030.rb 127
-rw-r--r-- test/ruby/enc/test_gbk.rb 29
-rw-r--r-- test/ruby/enc/test_grapheme_breaks.rb 92
-rw-r--r-- test/ruby/enc/test_iso_8859.rb 166
-rw-r--r-- test/ruby/enc/test_koi8.rb 23
-rw-r--r-- test/ruby/enc/test_regex_casefold.rb 120
-rw-r--r-- test/ruby/enc/test_shift_jis.rb 28
-rw-r--r-- test/ruby/enc/test_utf16.rb 397
-rw-r--r-- test/ruby/enc/test_utf32.rb 162
-rw-r--r-- test/ruby/enc/test_windows_1251.rb 17
-rw-r--r-- test/ruby/enc/test_windows_1252.rb 26
22 files changed, 2665 insertions, 0 deletions
diff --git a/test/ruby/enc/test_big5.rb b/test/ruby/enc/test_big5.rb
new file mode 100644
index 0000000000..5dcf93e8e3
--- /dev/null
+++ b/test/ruby/enc/test_big5.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestBig5 < Test::Unit::TestCase # Basic multibyte character handling checks for the Big5 encoding.
+ def s(s) # Tags the given string as Big5 via force_encoding (in place) and returns it.
+ s.force_encoding("big5")
+ end
+
+ def test_mbc_enc_len # the two bytes \xa1\xa1 must count as one character
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code # ord of the two-byte character is 0xa1a1
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc # Integer#chr with "big5" yields the two-byte character again
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("big5"))
+ end
+
+ def test_mbc_case_fold # case-insensitive regexp with a backreference matches the doubled character
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head # chop removes one whole two-byte character, not a single byte
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_case_comprehensive.rb b/test/ruby/enc/test_case_comprehensive.rb
new file mode 100644
index 0000000000..b812b88b83
--- /dev/null
+++ b/test/ruby/enc/test_case_comprehensive.rb
@@ -0,0 +1,306 @@
+# frozen_string_literal: true
+# Copyright © 2016 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class TestComprehensiveCaseMapping < Test::Unit::TestCase # Locates the Unicode data files that the generated case mapping tests read.
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION'] # version string taken from the build configuration
+ path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+ UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path # use the "ucd" subdirectory when it exists
+
+ def self.hex2utf8(s) # Turns space-separated hexadecimal code points into a UTF-8 string.
+ s.split(' ').map { |c| c.to_i(16) }.pack('U*')
+ end
+
+ def self.expand_filename(basename) # Full path of "<basename>.txt" inside UNICODE_DATA_PATH.
+ File.expand_path("#{UNICODE_DATA_PATH}/#{basename}.txt", __dir__)
+ end
+
+ def self.data_files_available? # True only if UnicodeData, CaseFolding and SpecialCasing all exist.
+ %w[UnicodeData CaseFolding SpecialCasing].all? do |f|
+ File.exist?(expand_filename(f))
+ end
+ end
+
+ def test_data_files_available # Reports an omission (instead of passing silently) when the data files are missing.
+ unless TestComprehensiveCaseMapping.data_files_available?
+ omit "Unicode data files not available in #{UNICODE_DATA_PATH}."
+ end
+ end
+end
+
+TestComprehensiveCaseMapping.data_files_available? and class TestComprehensiveCaseMapping
+ (CaseTest = Struct.new(:method_name, :attributes, :first_data, :follow_data)).class_eval do # one scenario: String method, option symbols, expected-result tables
+ def initialize(method_name, attributes, first_data, follow_data=first_data) # follow_data defaults to first_data
+ super
+ end
+ end
+
+ def self.read_data_file(filename)
+ File.foreach(expand_filename(filename), encoding: Encoding::ASCII_8BIT) do |line|
+ if $. == 1
+ if filename == 'UnicodeData'
+ elsif line.start_with?("# #{filename}-#{UNICODE_VERSION}.txt")
+ else
+ raise "File Version Mismatch"
+ end
+ end
+ next if /\A(?:[\#@]|\s*\z)|Surrogate/.match?(line)
+ data = line.chomp.split('#')[0].split(/;\s*/, 15)
+ code = data[0].to_i(16).chr(Encoding::UTF_8)
+ yield code, data
+ end
+ end
+
+ def self.read_data # Builds the expected-result tables from the Unicode data files and returns the list of CaseTest scenarios.
+ @@codepoints = []
+
+ downcase = Hash.new { |h, c| c } # characters without an entry map to themselves
+ upcase = Hash.new { |h, c| c }
+ titlecase = Hash.new { |h, c| c }
+ casefold = Hash.new { |h, c| c }
+ swapcase = Hash.new { |h, c| c }
+ turkic_upcase = Hash.new { |h, c| upcase[c] } # Turkic tables fall back to the general tables
+ turkic_downcase = Hash.new { |h, c| downcase[c] }
+ turkic_titlecase = Hash.new { |h, c| titlecase[c] }
+ turkic_swapcase = Hash.new { |h, c| swapcase[c] }
+ ascii_upcase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? upcase[c] : c } # ASCII tables change ASCII letters only
+ ascii_downcase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? downcase[c] : c }
+ ascii_titlecase = Hash.new { |h, c| /\A[a-zA-Z]\z/.match?(c) ? titlecase[c] : c }
+ ascii_swapcase = Hash.new { |h, c| /\A[a-z]\z/.match?(c) ? upcase[c] : (/\A[A-Z]\z/.match?(c) ? downcase[c] : c) }
+
+ read_data_file('UnicodeData') do |code, data| # fields 12/13/14 supply the upcase/downcase/titlecase entries
+ @@codepoints << code
+ upcase[code] = hex2utf8 data[12] unless data[12].empty?
+ downcase[code] = hex2utf8 data[13] unless data[13].empty?
+ if code>="\u1C90" and code<="\u1CBF" # exception for Georgian: use lowercase for titlecase
+ titlecase[code] = hex2utf8(data[13]) unless data[13].empty?
+ else
+ titlecase[code] = hex2utf8 data[14] unless data[14].empty?
+ end
+ end
+ read_data_file('CaseFolding') do |code, data|
+ casefold[code] = hex2utf8(data[2]) if data[1] =~ /^[CF]$/ # only entries whose second field is C or F
+ end
+
+ read_data_file('SpecialCasing') do |code, data|
+ case data[4] # fifth field selects how the entry is applied
+ when '' # empty: the entry overrides the general tables
+ upcase[code] = hex2utf8 data[3]
+ downcase[code] = hex2utf8 data[1]
+ titlecase[code] = hex2utf8 data[2]
+ when /\Atr\s*/ # entries starting with "tr" go to the Turkic tables, except 'tr After_I'
+ if data[4]!='tr After_I'
+ turkic_upcase[code] = hex2utf8 data[3]
+ turkic_downcase[code] = hex2utf8 data[1]
+ turkic_titlecase[code] = hex2utf8 data[2]
+ end
+ end
+ end
+
+ @@codepoints.each do |c| # derive the swapcase expectations from the upcase/downcase tables
+ if upcase[c] != c
+ if downcase[c] != c # both mappings change c: use the explicit results below
+ swapcase[c] = turkic_swapcase[c] =
+ case c
+ when "\u01C5" then "\u0064\u017D"
+ when "\u01C8" then "\u006C\u004A"
+ when "\u01CB" then "\u006E\u004A"
+ when "\u01F2" then "\u0064\u005A"
+ else # Greek
+ downcase[upcase[c][0]] + "\u0399"
+ end
+ else
+ swapcase[c] = upcase[c] # only upcase changes c
+ turkic_swapcase[c] = turkic_upcase[c]
+ end
+ else
+ if downcase[c] != c
+ swapcase[c] = downcase[c] # only downcase changes c
+ turkic_swapcase[c] = turkic_downcase[c]
+ end
+ end
+ end
+
+ [
+ CaseTest.new(:downcase, [], downcase),
+ CaseTest.new(:upcase, [], upcase),
+ CaseTest.new(:capitalize, [], titlecase, downcase), # first character from titlecase, following characters from downcase
+ CaseTest.new(:swapcase, [], swapcase),
+ CaseTest.new(:downcase, [:fold], casefold),
+ CaseTest.new(:upcase, [:turkic], turkic_upcase),
+ CaseTest.new(:downcase, [:turkic], turkic_downcase),
+ CaseTest.new(:capitalize, [:turkic], turkic_titlecase, turkic_downcase),
+ CaseTest.new(:swapcase, [:turkic], turkic_swapcase),
+ CaseTest.new(:upcase, [:ascii], ascii_upcase),
+ CaseTest.new(:downcase, [:ascii], ascii_downcase),
+ CaseTest.new(:capitalize, [:ascii], ascii_titlecase, ascii_downcase),
+ CaseTest.new(:swapcase, [:ascii], ascii_swapcase),
+ ]
+ end
+
+ def self.all_tests # Memoized list of CaseTest scenarios; falls back to an empty list when a data file is missing.
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ def self.generate_unicode_case_mapping_tests(encoding) # Defines one test method per CaseTest scenario for the given encoding.
+ all_tests.each do |test|
+ attributes = test.attributes.map(&:to_s).join '-'
+ attributes.prepend '_' unless attributes.empty?
+ define_method "test_#{encoding}_#{test.method_name}#{attributes}" do
+ @@codepoints.each do |code|
+ source = code.encode(encoding) * 5 # five copies: first_data applies to the first, follow_data to the other four
+ target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding)
+ result = source.__send__(test.method_name, *test.attributes)
+ assert_equal target, result,
+ proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"}
+ end
+ end
+ end
+ end
+
+ def self.generate_single_byte_case_mapping_tests(encoding) # Defines one test method per CaseTest scenario, limited to the byte values 0..255 of the encoding.
+ all_tests
+ # precalculate codepoints to speed up testing for small encodings
+ codepoints = []
+ (0..255).each do |cp|
+ begin
+ codepoints << cp.chr(encoding).encode('UTF-8')
+ rescue Encoding::UndefinedConversionError, RangeError # bytes that cannot be turned into UTF-8 are left out
+ end
+ end
+ all_tests.each do |test|
+ attributes = test.attributes.map(&:to_s).join '-'
+ attributes.prepend '_' unless attributes.empty?
+ define_method "test_#{encoding}_#{test.method_name}#{attributes}" do
+ codepoints.each do |code|
+ begin
+ source = code.encode(encoding) * 5
+ begin
+ target = "#{test.first_data[code]}#{test.follow_data[code]*4}".encode(encoding)
+ rescue Encoding::UndefinedConversionError # the expected result is not representable in this encoding
+ if test.first_data[code]=="i\u0307" or test.follow_data[code]=="i\u0307" # explicit dot above
+ first_data = test.first_data[code]=="i\u0307" ? 'i' : test.first_data[code]
+ follow_data = test.follow_data[code]=="i\u0307" ? 'i' : test.follow_data[code]
+ target = "#{first_data}#{follow_data*4}".encode(encoding)
+ elsif code =~ /i|I/ # special case for Turkic
+ raise
+ else
+ target = source # otherwise the string is expected to come back unchanged
+ end
+ end
+ result = source.send(test.method_name, *test.attributes)
+ assert_equal target, result,
+ proc{"from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"}
+ rescue Encoding::UndefinedConversionError # includes the error re-raised above: this character is skipped
+ end
+ end
+ end
+ end
+ end
+
+ # test for encodings that don't yet (or will never) deal with non-ASCII characters
+ def self.generate_ascii_only_case_mapping_tests(encoding) # Defines upcase/downcase/capitalize/swapcase tests that expect only ASCII letters to change.
+ all_tests
+ # preselect codepoints to speed up testing for small encodings
+ codepoints = @@codepoints.select do |code|
+ begin
+ code.encode(encoding)
+ true
+ rescue Encoding::UndefinedConversionError
+ false
+ end
+ end
+ define_method "test_#{encoding}_upcase" do
+ codepoints.each do |code|
+ begin
+ source = code.encode(encoding) * 5
+ target = source.tr 'a-z', 'A-Z' # expected result is computed with tr on ASCII letters
+ result = source.upcase
+ assert_equal target, result,
+ "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+ define_method "test_#{encoding}_downcase" do
+ codepoints.each do |code|
+ begin
+ source = code.encode(encoding) * 5
+ target = source.tr 'A-Z', 'a-z'
+ result = source.downcase
+ assert_equal target, result,
+ "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+ define_method "test_#{encoding}_capitalize" do
+ codepoints.each do |code|
+ begin
+ source = code.encode(encoding) * 5
+ target = source[0].tr('a-z', 'A-Z') + source[1..-1].tr('A-Z', 'a-z') # first character upcased, remainder downcased
+ result = source.capitalize
+ assert_equal target, result,
+ "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+ define_method "test_#{encoding}_swapcase" do
+ codepoints.each do |code|
+ begin
+ source = code.encode(encoding) * 5
+ target = source.tr('a-zA-Z', 'A-Za-z')
+ result = source.swapcase
+ assert_equal target, result,
+ "from #{code*5} (#{source.dump}) expected #{target.dump} but was #{result.dump}"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+ end
+
+ generate_single_byte_case_mapping_tests 'US-ASCII' # the calls below define the test methods when this file is loaded
+ generate_single_byte_case_mapping_tests 'ASCII-8BIT'
+ generate_single_byte_case_mapping_tests 'ISO-8859-1'
+ generate_single_byte_case_mapping_tests 'ISO-8859-2'
+ generate_single_byte_case_mapping_tests 'ISO-8859-3'
+ generate_single_byte_case_mapping_tests 'ISO-8859-4'
+ generate_single_byte_case_mapping_tests 'ISO-8859-5'
+ generate_single_byte_case_mapping_tests 'ISO-8859-6'
+ generate_single_byte_case_mapping_tests 'ISO-8859-7'
+ generate_single_byte_case_mapping_tests 'ISO-8859-8'
+ generate_single_byte_case_mapping_tests 'ISO-8859-9'
+ generate_single_byte_case_mapping_tests 'ISO-8859-10'
+ generate_single_byte_case_mapping_tests 'ISO-8859-11'
+ generate_single_byte_case_mapping_tests 'ISO-8859-13'
+ generate_single_byte_case_mapping_tests 'ISO-8859-14'
+ generate_single_byte_case_mapping_tests 'ISO-8859-15'
+ generate_single_byte_case_mapping_tests 'ISO-8859-16'
+ generate_ascii_only_case_mapping_tests 'KOI8-R' # ASCII-only expectations: only ASCII letters are expected to change
+ generate_ascii_only_case_mapping_tests 'KOI8-U'
+ generate_ascii_only_case_mapping_tests 'Big5'
+ generate_ascii_only_case_mapping_tests 'EUC-JP'
+ generate_ascii_only_case_mapping_tests 'EUC-KR'
+ generate_ascii_only_case_mapping_tests 'GB18030'
+ generate_ascii_only_case_mapping_tests 'GB2312'
+ generate_ascii_only_case_mapping_tests 'GBK'
+ generate_ascii_only_case_mapping_tests 'Shift_JIS'
+ generate_ascii_only_case_mapping_tests 'Windows-31J'
+ generate_single_byte_case_mapping_tests 'Windows-1250'
+ generate_single_byte_case_mapping_tests 'Windows-1251'
+ generate_single_byte_case_mapping_tests 'Windows-1252'
+ generate_single_byte_case_mapping_tests 'Windows-1253'
+ generate_single_byte_case_mapping_tests 'Windows-1254'
+ generate_single_byte_case_mapping_tests 'Windows-1255'
+ generate_ascii_only_case_mapping_tests 'Windows-1256'
+ generate_single_byte_case_mapping_tests 'Windows-1257'
+ generate_unicode_case_mapping_tests 'UTF-8' # full Unicode expectations over every code point read from UnicodeData
+ generate_unicode_case_mapping_tests 'UTF-16BE'
+ generate_unicode_case_mapping_tests 'UTF-16LE'
+ generate_unicode_case_mapping_tests 'UTF-32BE'
+ generate_unicode_case_mapping_tests 'UTF-32LE'
+end
diff --git a/test/ruby/enc/test_case_mapping.rb b/test/ruby/enc/test_case_mapping.rb
new file mode 100644
index 0000000000..a7d1ed0d16
--- /dev/null
+++ b/test/ruby/enc/test_case_mapping.rb
@@ -0,0 +1,231 @@
+# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+# preliminary tests, using as a guard
+# to test new implementation strategy
+class TestCaseMappingPreliminary < Test::Unit::TestCase
+ # checks, including idempotence and non-modification; not always guaranteed
+ def check_upcase_properties(expected, start, *flags)
+ assert_equal expected, start.upcase(*flags)
+ temp = start.dup
+ assert_equal expected, temp.upcase!(*flags) unless expected==temp
+ assert_equal nil, temp.upcase!(*flags) if expected==temp
+ assert_equal expected, expected.upcase(*flags)
+ temp = expected.dup
+ assert_nil temp.upcase!(*flags)
+ end
+
+ def check_downcase_properties(expected, start, *flags)
+ assert_equal expected, start.downcase(*flags)
+ temp = start.dup
+ assert_equal expected, temp.downcase!(*flags) unless expected==temp
+ assert_equal nil, temp.downcase!(*flags) if expected==temp
+ assert_equal expected, expected.downcase(*flags)
+ temp = expected.dup
+ assert_nil temp.downcase!(*flags)
+ end
+
+ def check_capitalize_properties(expected, start, *flags)
+ assert_equal expected, start.capitalize(*flags)
+ temp = start.dup
+ assert_equal expected, temp.capitalize!(*flags) unless expected==temp
+ assert_equal nil, temp.capitalize!(*flags) if expected==temp
+ assert_equal expected, expected.capitalize(*flags)
+ temp = expected.dup
+ assert_nil temp.capitalize!(*flags)
+ end
+
+ def check_capitalize_suffixes(lower, upper) # Runs the capitalize checks on successively shorter suffixes of upper.
+ while upper.length > 1
+ lower = lower[1..-1]
+ check_capitalize_properties upper[0]+lower, upper # expected: first character of upper followed by the remaining lower part
+ upper = upper[1..-1]
+ end
+ end
+
+ # different properties; careful: roundtrip isn't always guaranteed
+ def check_swapcase_properties(expected, start, *flags)
+ assert_equal expected, start.swapcase(*flags)
+ temp = +start
+ assert_equal expected, temp.swapcase!(*flags)
+ assert_equal start, start.swapcase(*flags).swapcase(*flags)
+ assert_equal expected, expected.swapcase(*flags).swapcase(*flags)
+ end
+
+ def test_ascii # All four case operations on plain ASCII text, without options.
+ check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)'
+ check_upcase_properties 'YUKIHIRO MATSUMOTO (MATZ)', 'yukihiro matsumoto (matz)'
+ check_capitalize_properties 'Yukihiro matsumoto (matz)', 'yukihiro MATSUMOTO (MATZ)'
+ check_swapcase_properties 'yUKIHIRO matsumoto (MAtz)', 'Yukihiro MATSUMOTO (maTZ)'
+ end
+
+ def test_invalid # Each case operation must raise ArgumentError for the lone byte \xEB tagged as UTF-8.
+ assert_raise(ArgumentError, "Should not be possible to upcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').upcase }
+ assert_raise(ArgumentError, "Should not be possible to downcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').downcase }
+ assert_raise(ArgumentError, "Should not be possible to capitalize invalid string.") { "\xEB".dup.force_encoding('UTF-8').capitalize }
+ assert_raise(ArgumentError, "Should not be possible to swapcase invalid string.") { "\xEB".dup.force_encoding('UTF-8').swapcase }
+ end
+
+ def test_general # Case operations on Latin text containing letters with diacritics.
+ check_downcase_properties 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ'
+ check_upcase_properties 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ', 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ'
+ check_capitalize_suffixes 'résumé dürst ĭñŧėřŋãţijňőńæłĩżàťïōņ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤÏŌŅ'
+ check_swapcase_properties 'résumé DÜRST ĭñŧėřŊÃŢIJŇŐŃæłĩżàťïōņ', 'RÉSUMÉ dürst ĬÑŦĖŘŋãţijňőńÆŁĨŻÀŤÏŌŅ'
+ end
+
+ def test_one_way_upcase # presumably characters whose uppercase form does not map back to them (going by the name) — confirm
+ check_upcase_properties 'ΜΜΜΜΜ', 'µµµµµ' # MICRO SIGN -> Greek Mu
+ check_downcase_properties 'µµµµµ', 'µµµµµ' # MICRO SIGN -> Greek Mu
+ check_capitalize_properties 'Μµµµµ', 'µµµµµ' # MICRO SIGN -> Greek Mu
+ check_capitalize_properties 'Μµµµµ', 'µµµµµ', :turkic # MICRO SIGN -> Greek Mu
+ check_capitalize_properties 'H̱ẖẖẖẖ', 'ẖẖẖẖẖ'
+ check_capitalize_properties 'Βϐϐϐϐ', 'ϐϐϐϐϐ'
+ check_capitalize_properties 'Θϑϑϑϑ', 'ϑϑϑϑϑ'
+ check_capitalize_properties 'Φϕ', 'ϕϕ'
+ check_capitalize_properties 'Πϖ', 'ϖϖ'
+ check_capitalize_properties 'Κϰ', 'ϰϰ'
+ check_capitalize_properties 'Ρϱϱ', 'ϱϱϱ'
+ check_capitalize_properties 'Εϵ', 'ϵϵ'
+ check_capitalize_properties 'Ιͅͅͅͅ', 'ͅͅͅͅͅ'
+ check_capitalize_properties 'Sſſſſ', 'ſſſſſ'
+ end
+
+ def test_various # Assorted single-character cases: micro sign and sharp s.
+ check_upcase_properties 'Μ', 'µ' # MICRO SIGN -> Greek Mu
+ check_downcase_properties 'µµµµµ', 'µµµµµ' # MICRO SIGN
+ check_capitalize_properties 'Ss', 'ß'
+ check_upcase_properties 'SS', 'ß'
+ end
+
+ def test_cherokee # Cherokee letters: the \uab70.. forms are the downcased ones; downcase(:fold) yields the other form for both.
+ check_downcase_properties "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ'
+ check_upcase_properties 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79"
+ check_capitalize_suffixes "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79", 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ'
+ assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ'.downcase(:fold)
+ assert_equal 'ᎠᎡᎢᎣᎤᎥᎦᎧᎨᎩ', "\uab70\uab71\uab72\uab73\uab74\uab75\uab76\uab77\uab78\uab79".downcase(:fold)
+ end
+
+ def test_titlecase # NOTE(review): presumably meant for single-code-point digraph letters; the literals here read as letter pairs — confirm
+ check_downcase_properties 'dz dž lj nj', 'Dz Dž Lj Nj'
+ check_downcase_properties 'dz dž lj nj', 'DZ DŽ LJ NJ'
+ check_upcase_properties 'DZ DŽ LJ NJ', 'Dz Dž Lj Nj'
+ check_upcase_properties 'DZ DŽ LJ NJ', 'dz dž lj nj'
+ check_capitalize_properties 'Dz', 'DZ'
+ check_capitalize_properties 'Dž', 'DŽ'
+ check_capitalize_properties 'Lj', 'LJ'
+ check_capitalize_properties 'Nj', 'NJ'
+ check_capitalize_properties 'Dz', 'dz'
+ check_capitalize_properties 'Dž', 'dž'
+ check_capitalize_properties 'Lj', 'lj'
+ check_capitalize_properties 'Nj', 'nj'
+ end
+
+ def test_swapcase # swapcase results for mixed-case pairs and for Greek letters that expand to two characters.
+ assert_equal 'dZ', 'Dz'.swapcase
+ assert_equal 'dŽ', 'Dž'.swapcase
+ assert_equal 'lJ', 'Lj'.swapcase
+ assert_equal 'nJ', 'Nj'.swapcase
+ assert_equal 'ἀΙ', 'ᾈ'.swapcase
+ assert_equal 'ἣΙ', 'ᾛ'.swapcase
+ assert_equal 'ὧΙ', 'ᾯ'.swapcase
+ assert_equal 'αΙ', 'ᾼ'.swapcase
+ assert_equal 'ηΙ', 'ῌ'.swapcase
+ assert_equal 'ωΙ', 'ῼ'.swapcase
+ end
+
+ def test_ascii_option # With :ascii only ASCII letters change; non-ASCII letters in the expected strings stay as given.
+ check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :ascii
+ check_upcase_properties 'YUKIHIRO MATSUMOTO (MATZ)', 'yukihiro matsumoto (matz)', :ascii
+ check_capitalize_properties 'Yukihiro matsumoto (matz)', 'yukihiro MATSUMOTO (MATZ)', :ascii
+ check_swapcase_properties 'yUKIHIRO matsumoto (MAtz)', 'Yukihiro MATSUMOTO (maTZ)', :ascii
+ check_downcase_properties 'yukİhİro matsumoto (matz)', 'YUKİHİRO MATSUMOTO (MATZ)', :ascii
+ check_downcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii
+ check_swapcase_properties 'rÉsumÉ dÜrst ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', 'RÉSUMÉ DÜRST ĬÑŦĖŘŊÃŢIJŇŐŃÆŁĨŻÀŤĬŌŅ', :ascii
+ end
+
+ def test_fold_option # Expected results of downcase with the :fold option.
+ check_downcase_properties 'ss', 'ß', :fold
+ check_downcase_properties 'fifl', 'fifl', :fold
+ check_downcase_properties 'σ', 'ς', :fold
+ check_downcase_properties 'μ', 'µ', :fold # MICRO SIGN -> Greek mu
+ end
+
+ def test_turcic # Dotted/dotless i handling with and without :turkic. NOTE(review): name is presumably a typo for test_turkic.
+ check_downcase_properties 'yukihiro matsumoto (matz)', 'Yukihiro MATSUMOTO (MATZ)', :turkic
+ check_upcase_properties 'YUKİHİRO MATSUMOTO (MATZ)', 'Yukihiro Matsumoto (matz)', :turkic
+ check_downcase_properties "yuki\u0307hi\u0307ro matsumoto (matz)", 'YUKİHİRO MATSUMOTO (MATZ)'
+ end
+
+ def test_greek # downcase and upcase over a run of Greek letters.
+ check_downcase_properties 'αβγδεζηθικλμνξοπρστυφχψω', 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ'
+ check_upcase_properties 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ', 'αβγδεζηθικλμνξοπρστυφχψω'
+ end
+
+ # This test checks against problems when changing the order of mapping results
+ # in some of the entries of the unfolding table (related to
+ # https://bugs.ruby-lang.org/issues/12990).
+ def test_reorder_unfold # Each pair must match case-insensitively in both directions, at position 0.
+ # GREEK SMALL LETTER IOTA
+ assert_equal 0, "\u03B9" =~ /\u0345/i
+ assert_equal 0, "\u0345" =~ /\u03B9/i
+ assert_equal 0, "\u03B9" =~ /\u0399/i
+ assert_equal 0, "\u0399" =~ /\u03B9/i
+ assert_equal 0, "\u03B9" =~ /\u1fbe/i
+ assert_equal 0, "\u1fbe" =~ /\u03B9/i
+
+ # GREEK SMALL LETTER MU
+ assert_equal 0, "\u03BC" =~ /\u00B5/i
+ assert_equal 0, "\u00B5" =~ /\u03BC/i
+ assert_equal 0, "\u03BC" =~ /\u039C/i
+ assert_equal 0, "\u039C" =~ /\u03BC/i
+
+ # CYRILLIC SMALL LETTER MONOGRAPH UK
+ assert_equal 0, "\uA64B" =~ /\u1c88/i
+ assert_equal 0, "\u1c88" =~ /\uA64B/i
+ assert_equal 0, "\uA64B" =~ /\ua64A/i
+ assert_equal 0, "\ua64A" =~ /\uA64B/i
+ end
+
+ def test_georgian_canary # Fails once U+1CBB or U+1CBC match \p{assigned}, signalling that capitalize needs another look.
+ message = "Reexamine implementation of Georgian in String#capitalize"
+ assert_equal false, "\u1CBB".match?(/\p{assigned}/), message
+ assert_equal false, "\u1CBC".match?(/\p{assigned}/), message
+ end
+
+ def test_georgian_unassigned # capitalize must leave U+1CBB and U+1CBC unchanged.
+ message = "Unassigned codepoints should not be converted"
+ assert_equal "\u1CBB", "\u1CBB".capitalize, message
+ assert_equal "\u1CBC", "\u1CBC".capitalize, message
+ end
+
+ def test_georgian_capitalize # Every mix of U+1C9x and U+10Dx letters must capitalize to the all-U+10Dx string.
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u1C91\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u1C90\u10D1\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u1C91\u10D2".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u1C92".capitalize
+ assert_equal "\u10D0\u10D1\u10D2", "\u10D0\u10D1\u10D2".capitalize
+ end
+
+ def test_shift_jis_downcase_ascii # Byte pairs \x89 + "A".."Z" tagged as Shift_JIS must not be altered by downcase(:ascii).
+ s = ("A".."Z").map {|c| "\x89#{c}"}.join("").force_encoding("Shift_JIS")
+ assert_equal s, s.downcase(:ascii)
+ end
+
+ def test_shift_jis_upcase_ascii # Byte pairs \x89 + "a".."z" tagged as Shift_JIS must not be altered by upcase(:ascii).
+ s = ("a".."z").map {|c| "\x89#{c}"}.join("").force_encoding("Shift_JIS")
+ assert_equal s, s.upcase(:ascii)
+ end
+
+ def no_longer_a_test_buffer_allocations # name lacks the test_ prefix, so presumably the runner does not pick it up — confirm
+ assert_equal 'TURKISH*ı'*10, ('I'*10).downcase(:turkic)
+ assert_equal 'TURKISH*ı'*100, ('I'*100).downcase(:turkic)
+ assert_equal 'TURKISH*ı'*1_000, ('I'*1_000).downcase(:turkic)
+ assert_equal 'TURKISH*ı'*10_000, ('I'*10_000).downcase(:turkic)
+ assert_equal 'TURKISH*ı'*100_000, ('I'*100_000).downcase(:turkic)
+ assert_equal 'TURKISH*ı'*1_000_000, ('I'*1_000_000).downcase(:turkic)
+ end
+end
diff --git a/test/ruby/enc/test_case_options.rb b/test/ruby/enc/test_case_options.rb
new file mode 100644
index 0000000000..e9c81d804e
--- /dev/null
+++ b/test/ruby/enc/test_case_options.rb
@@ -0,0 +1,81 @@
+# Copyright © 2016 Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class TestCaseOptions < Test::Unit::TestCase # Checks which option combinations the case mapping methods accept or reject.
+ def assert_raise_functional_operations(arg, *options) # all four non-bang methods must raise ArgumentError
+ assert_raise(ArgumentError) { arg.upcase(*options) }
+ assert_raise(ArgumentError) { arg.downcase(*options) }
+ assert_raise(ArgumentError) { arg.capitalize(*options) }
+ assert_raise(ArgumentError) { arg.swapcase(*options) }
+ end
+
+ def assert_raise_bang_operations(arg, *options) # all four bang methods must raise ArgumentError
+ assert_raise(ArgumentError) { arg.upcase!(*options) }
+ assert_raise(ArgumentError) { arg.downcase!(*options) }
+ assert_raise(ArgumentError) { arg.capitalize!(*options) }
+ assert_raise(ArgumentError) { arg.swapcase!(*options) }
+ end
+
+ def assert_raise_both_types(*options) # rejection for a String (non-bang and bang) and for a Symbol
+ assert_raise_functional_operations 'a', *options
+ assert_raise_bang_operations(+'a', *options)
+ assert_raise_functional_operations :a, *options
+ end
+
+ def test_option_errors # option lists that must be rejected
+ assert_raise_both_types :invalid
+ assert_raise_both_types :lithuanian, :turkic, :fold
+ assert_raise_both_types :fold, :fold
+ assert_raise_both_types :ascii, :fold
+ assert_raise_both_types :fold, :ascii
+ assert_raise_both_types :ascii, :turkic
+ assert_raise_both_types :turkic, :ascii
+ assert_raise_both_types :ascii, :lithuanian
+ assert_raise_both_types :lithuanian, :ascii
+ end
+
+ def assert_okay_functional_operations(arg, *options) # none of the four non-bang methods may raise
+ assert_nothing_raised { arg.upcase(*options) }
+ assert_nothing_raised { arg.downcase(*options) }
+ assert_nothing_raised { arg.capitalize(*options) }
+ assert_nothing_raised { arg.swapcase(*options) }
+ end
+
+ def assert_okay_bang_operations(arg, *options) # none of the four bang methods may raise
+ assert_nothing_raised { arg.upcase!(*options) }
+ assert_nothing_raised { arg.downcase!(*options) }
+ assert_nothing_raised { arg.capitalize!(*options) }
+ assert_nothing_raised { arg.swapcase!(*options) }
+ end
+
+ def assert_okay_both_types(*options) # acceptance for a String (non-bang and bang) and for a Symbol
+ assert_okay_functional_operations 'a', *options
+ assert_okay_bang_operations(+'a', *options)
+ assert_okay_functional_operations :a, *options
+ end
+
+ def test_options_okay # option lists that must be accepted, including no options at all
+ assert_okay_both_types
+ assert_okay_both_types :ascii
+ assert_okay_both_types :turkic
+ assert_okay_both_types :lithuanian
+ assert_okay_both_types :turkic, :lithuanian
+ assert_okay_both_types :lithuanian, :turkic
+ end
+
+ def test_operation_specific # :fold option only allowed on downcase
+ assert_nothing_raised { 'a'.downcase :fold }
+ assert_raise(ArgumentError) { 'a'.upcase :fold }
+ assert_raise(ArgumentError) { 'a'.capitalize :fold }
+ assert_raise(ArgumentError) { 'a'.swapcase :fold }
+ assert_nothing_raised { 'a'.dup.downcase! :fold }
+ assert_raise(ArgumentError) { 'a'.dup.upcase! :fold }
+ assert_raise(ArgumentError) { 'a'.dup.capitalize! :fold }
+ assert_raise(ArgumentError) { 'a'.dup.swapcase! :fold }
+ assert_nothing_raised { :a.downcase :fold }
+ assert_raise(ArgumentError) { :a.upcase :fold }
+ assert_raise(ArgumentError) { :a.capitalize :fold }
+ assert_raise(ArgumentError) { :a.swapcase :fold }
+ end
+end
diff --git a/test/ruby/enc/test_cesu8.rb b/test/ruby/enc/test_cesu8.rb
new file mode 100644
index 0000000000..68a08389ea
--- /dev/null
+++ b/test/ruby/enc/test_cesu8.rb
@@ -0,0 +1,113 @@
+# frozen_string_literal: false
+require 'test/unit'
+
+class TestCESU8 < Test::Unit::TestCase
+
+ def encdump(obj)
+ case obj
+ when String
+ obj.dump
+ when Regexp
+ "Regexp.new(#{encdump(obj.source)}, #{obj.options})"
+ else
+ raise Argument, "unexpected: #{obj.inspect}"
+ end
+ end
+
+ def enccall(recv, meth, *args)
+ desc = ''
+ if String === recv
+ desc << encdump(recv)
+ else
+ desc << recv.inspect
+ end
+ desc << '.' << meth.to_s
+ if !args.empty?
+ desc << '('
+ args.each_with_index {|a, i|
+ desc << ',' if 0 < i
+ if String === a
+ desc << encdump(a)
+ else
+ desc << a.inspect
+ end
+ }
+ desc << ')'
+ end
+ result = nil
+ assert_nothing_raised(desc) {
+ result = recv.send(meth, *args)
+ }
+ result
+ end
+
+ def assert_str_equal(expected, actual, message=nil) # assert_equal whose failure message shows both strings through encdump
+ full_message = build_message(message, <<EOT)
+#{encdump expected} expected but not equal to
+#{encdump actual}.
+EOT
+ assert_equal(expected, actual, full_message)
+ end
+
+ # tests start
+
+ def test_cesu8_valid_encoding # First list must be valid once tagged as CESU-8, second list must be invalid.
+ all_assertions do |a|
+ [
+ "\x00",
+ "\x7f",
+ "\u0080",
+ "\u07ff",
+ "\u0800",
+ "\ud7ff",
+ "\xed\xa0\x80\xed\xb0\x80", # two three-byte sequences forming one character
+ "\xed\xaf\xbf\xed\xbf\xbf",
+ "\ue000",
+ "\uffff",
+ ].each {|s|
+ s.force_encoding("cesu-8")
+ a.for(s) {
+ assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ [
+ "\x80",
+ "\xc0\x80",
+ "\xc0",
+ "\xe0\x80\x80",
+ "\xed\xa0\x80", # first half of the pair above on its own
+ "\xed\xb0\x80\xed\xb0\x80", # second half of the pair above, repeated
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("cesu-8")
+ a.for(s) {
+ assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ end
+ end
+
+ def test_cesu8_ord # Each byte string must give the listed code point via ord, and chr("cesu-8") must give the bytes back.
+ [
+ ["\x00", 0],
+ ["\x7f", 0x7f],
+ ["\u0080", 0x80],
+ ["\u07ff", 0x7ff],
+ ["\u0800", 0x800],
+ ["\ud7ff", 0xd7ff],
+ ["\xed\xa0\x80\xed\xb0\x80", 0x10000], # six bytes for a code point above 0xffff
+ ["\xed\xaf\xbf\xed\xbf\xbf", 0x10ffff],
+ ["\xee\x80\x80", 0xe000],
+ ["\xef\xbf\xbf", 0xffff],
+ ].each do |chr, ord|
+ chr.force_encoding("cesu-8")
+ assert_equal ord, chr.ord
+ assert_equal chr, ord.chr("cesu-8")
+ end
+ end
+
+ def test_cesu8_left_adjust_char_head # chop on U+10000 encoded as CESU-8 must remove the whole character
+ assert_equal("", "\u{10000}".encode("cesu-8").chop)
+ end
+end
diff --git a/test/ruby/enc/test_cp949.rb b/test/ruby/enc/test_cp949.rb
new file mode 100644
index 0000000000..0684162d5b
--- /dev/null
+++ b/test/ruby/enc/test_cp949.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestCP949 < Test::Unit::TestCase # Basic multibyte character handling checks for the CP949 encoding.
+ def s(s) # Tags the given string as CP949 via force_encoding (in place) and returns it.
+ s.force_encoding("cp949")
+ end
+
+ def test_mbc_enc_len # the two bytes \xa1\xa1 must count as one character
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code # ord of the two-byte character is 0xa1a1
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc # Integer#chr with "cp949" yields the two-byte character again
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("cp949"))
+ end
+
+ def test_mbc_case_fold # case-insensitive regexp with a backreference matches the doubled character
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head # chop removes one whole two-byte character, not a single byte
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_emoji.rb b/test/ruby/enc/test_emoji.rb
new file mode 100644
index 0000000000..330ff70cb9
--- /dev/null
+++ b/test/ruby/enc/test_emoji.rb
@@ -0,0 +1,443 @@
+# frozen_string_literal: false
+require 'test/unit'
+
+module Emoji
+
+ class TestRenameSJIS < Test::Unit::TestCase
+ def test_shift_jis
+ assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-DoCoMo") }
+ assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-KDDI") }
+ assert_raise(ArgumentError) { "".force_encoding("Shift_JIS-SoftBank") }
+ end
+ end
+
+ class TestUTF8_BLACK_SUN_WITH_RAYS < Test::Unit::TestCase
+ include Emoji
+
+ def setup
+ @codes = {
+ "UTF8-DoCoMo" => utf8_docomo("\u{E63E}"),
+ "UTF8-KDDI" => utf8_kddi("\u{E488}"),
+ "UTF8-SoftBank" => utf8_softbank("\u{E04A}"),
+ "UTF-8" => "\u{2600}",
+ }
+ end
+
+ def test_convert
+ @codes.each do |from_enc, from_str|
+ @codes.each do |to_enc, to_str|
+ next if from_enc == to_enc
+ assert_equal to_str, from_str.encode(to_enc), "convert from #{from_enc} to #{to_enc}"
+ end
+ end
+ end
+ end
+
+ # Conversion tests for the DoCoMo carrier encodings (UTF8-DoCoMo and
+ # SJIS-DoCoMo). Fixtures come from Emoji#setup_instance_variable.
+ class TestDoCoMo < Test::Unit::TestCase
+ include Emoji
+
+ # Populates the @aiueo_*, @utf8* and @sjis* fixtures on this test instance.
+ def setup
+ setup_instance_variable(self)
+ end
+
+ # Both DoCoMo encoding names must appear in Encoding.name_list.
+ def test_encoding_name
+ %w(UTF8-DoCoMo
+ SJIS-DoCoMo).each do |n|
+ assert_include Encoding.name_list, n, "encoding not found: #{n}"
+ end
+ end
+
+ # The carrier encodings are distinct objects from their base encodings.
+ def test_comparison
+ assert_not_equal Encoding::UTF_8, Encoding::UTF8_DoCoMo
+ assert_not_equal Encoding::Windows_31J, Encoding::SJIS_DoCoMo
+ end
+
+ # Ordinary hiragana text in UTF-8 converts to both DoCoMo encodings.
+ def test_from_utf8
+ assert_nothing_raised { assert_equal utf8_docomo(@aiueo_utf8), to_utf8_docomo(@aiueo_utf8) }
+ assert_nothing_raised { assert_equal sjis_docomo(@aiueo_sjis), to_sjis_docomo(@aiueo_utf8) }
+ end
+
+ # The same text in Windows-31J converts to both DoCoMo encodings.
+ def test_from_sjis
+ assert_nothing_raised { assert_equal utf8_docomo(@aiueo_utf8), to_utf8_docomo(@aiueo_sjis) }
+ assert_nothing_raised { assert_equal sjis_docomo(@aiueo_sjis), to_sjis_docomo(@aiueo_sjis) }
+ end
+
+ # The DoCoMo emoji fixtures convert to the plain UTF-8 fixture @utf8.
+ def test_to_utf8
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_docomo) }
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_docomo) }
+ end
+
+ # Converting the emoji to plain Windows-31J must fail.
+ def test_to_sjis
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_docomo) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_docomo) }
+ end
+
+ # Converting the emoji to eucJP-ms must fail.
+ def test_to_eucjp
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_docomo) }
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_docomo) }
+ end
+
+ # The two DoCoMo encodings convert into each other.
+ def test_docomo
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_docomo) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_docomo) }
+ end
+
+ # The shared emoji converts to every KDDI encoding; the *_docomo_only
+ # fixtures must raise UndefinedConversionError instead.
+ def test_to_kddi
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@utf8_docomo) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_docomo) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_docomo) }
+
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_docomo) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@sjis_docomo) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_docomo) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@utf8_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@utf8_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@utf8_docomo_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@sjis_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@sjis_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@sjis_docomo_only) }
+ end
+
+ # The shared emoji converts to both SoftBank encodings; the *_docomo_only
+ # fixtures must raise UndefinedConversionError instead.
+ def test_to_softbank
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_docomo) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_docomo) }
+
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_docomo) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_docomo) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_softbank(@utf8_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_softbank(@utf8_docomo_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_softbank(@sjis_docomo_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_softbank(@sjis_docomo_only) }
+ end
+ end
+
+ class TestKDDI < Test::Unit::TestCase
+ include Emoji
+
+ def setup
+ setup_instance_variable(self)
+ end
+
+ def test_encoding_name
+ %w(UTF8-KDDI
+ SJIS-KDDI
+ ISO-2022-JP-KDDI
+ stateless-ISO-2022-JP-KDDI).each do |n|
+ assert_include Encoding.name_list, n, "encoding not found: #{n}"
+ end
+ end
+
+ def test_comparison
+ assert_not_equal Encoding::UTF_8, Encoding::UTF8_KDDI
+ assert_not_equal Encoding::Windows_31J, Encoding::SJIS_KDDI
+ assert_not_equal Encoding::ISO_2022_JP, Encoding::ISO_2022_JP_KDDI
+ assert_not_equal Encoding::Stateless_ISO_2022_JP, Encoding::Stateless_ISO_2022_JP_KDDI
+ end
+
+ def test_from_utf8
+ assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_utf8) }
+ assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_utf8) }
+ assert_nothing_raised { assert_equal iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_utf8) }
+ end
+
+ def test_from_sjis
+ assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_sjis) }
+ assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_sjis) }
+ assert_nothing_raised { assert_equal iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_sjis) }
+ end
+
+ def test_from_iso2022jp
+ assert_nothing_raised { assert_equal utf8_kddi(@aiueo_utf8), to_utf8_kddi(@aiueo_iso2022jp) }
+ assert_nothing_raised { assert_equal sjis_kddi(@aiueo_sjis), to_sjis_kddi(@aiueo_iso2022jp) }
+ assert_nothing_raised { assert_equal iso2022jp_kddi(@aiueo_iso2022jp), to_iso2022jp_kddi(@aiueo_iso2022jp) }
+ end
+
+ def test_to_utf8
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_kddi) }
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_undoc_kddi) }
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@iso2022jp_kddi) }
+ end
+
+ def test_to_sjis
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_undoc_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@iso2022jp_kddi) }
+ end
+
+ def test_to_eucjp
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_undoc_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_kddi) }
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@iso2022jp_kddi) }
+ end
+
+ def test_kddi
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@iso2022jp_kddi) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_undoc_kddi) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@iso2022jp_kddi) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_undoc_kddi) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@iso2022jp_kddi) }
+ end
+
+ def test_to_docomo
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_kddi) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_undoc_kddi) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_undoc_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@iso2022jp_kddi) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@iso2022jp_kddi) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_undoc_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_undoc_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_docomo, to_utf8_docomo(@iso2022jp_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_docomo, to_sjis_docomo(@iso2022jp_kddi_only) }
+ end
+
+ def test_to_softbank
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_kddi) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_undoc_kddi) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_undoc_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_kddi) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_kddi) }
+
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@iso2022jp_kddi) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@iso2022jp_kddi) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@utf8_undoc_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_undoc_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@sjis_kddi_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @utf8_softbank, to_utf8_softbank(@iso2022jp_kddi_only) }
+ assert_raise(Encoding::UndefinedConversionError) { assert_equal @sjis_softbank, to_sjis_softbank(@iso2022jp_kddi_only) }
+ end
+ end
+
+ # Conversion tests for the SoftBank carrier encodings (UTF8-SoftBank and
+ # SJIS-SoftBank). Fixtures come from Emoji#setup_instance_variable.
+ class TestSoftBank < Test::Unit::TestCase
+ include Emoji
+
+ # Populates the @aiueo_*, @utf8* and @sjis* fixtures on this test instance.
+ def setup
+ setup_instance_variable(self)
+ end
+
+ # Both SoftBank encoding names must appear in Encoding.name_list.
+ def test_encoding_name
+ %w(UTF8-SoftBank
+ SJIS-SoftBank).each do |n|
+ assert_include Encoding.name_list, n, "encoding not found: #{n}"
+ end
+ end
+
+ # The carrier encodings are distinct objects from their base encodings.
+ def test_comparison
+ assert_not_equal Encoding::UTF_8, Encoding::UTF8_SoftBank
+ assert_not_equal Encoding::Windows_31J, Encoding::SJIS_SoftBank
+ end
+
+ # Ordinary hiragana text in UTF-8 converts to both SoftBank encodings.
+ def test_from_utf8
+ assert_nothing_raised { assert_equal utf8_softbank(@aiueo_utf8), to_utf8_softbank(@aiueo_utf8) }
+ assert_nothing_raised { assert_equal sjis_softbank(@aiueo_sjis), to_sjis_softbank(@aiueo_utf8) }
+ end
+
+ # The same text in Windows-31J converts to both SoftBank encodings.
+ def test_from_sjis
+ assert_nothing_raised { assert_equal utf8_softbank(@aiueo_utf8), to_utf8_softbank(@aiueo_sjis) }
+ assert_nothing_raised { assert_equal sjis_softbank(@aiueo_sjis), to_sjis_softbank(@aiueo_sjis) }
+ end
+
+ # The SoftBank emoji fixtures convert to the plain UTF-8 fixture @utf8.
+ def test_to_utf8
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@utf8_softbank) }
+ assert_nothing_raised { assert_equal @utf8, to_utf8(@sjis_softbank) }
+ end
+
+ # Converting the emoji to plain Windows-31J must fail.
+ def test_to_sjis
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@utf8_softbank) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis(@sjis_softbank) }
+ end
+
+ # Converting the emoji to eucJP-ms must fail.
+ def test_to_eucjp
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@utf8_softbank) }
+ assert_raise(Encoding::UndefinedConversionError) { to_eucjp(@sjis_softbank) }
+ end
+
+ # The two SoftBank encodings convert into each other.
+ def test_softbank
+ assert_nothing_raised { assert_equal @utf8_softbank, to_utf8_softbank(@sjis_softbank) }
+ assert_nothing_raised { assert_equal @sjis_softbank, to_sjis_softbank(@utf8_softbank) }
+ end
+
+ # The shared emoji converts to both DoCoMo encodings; the *_softbank_only
+ # fixtures must raise UndefinedConversionError instead.
+ def test_to_docomo
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@utf8_softbank) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@utf8_softbank) }
+
+ assert_nothing_raised { assert_equal @utf8_docomo, to_utf8_docomo(@sjis_softbank) }
+ assert_nothing_raised { assert_equal @sjis_docomo, to_sjis_docomo(@sjis_softbank) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_docomo(@utf8_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_docomo(@utf8_softbank_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_docomo(@sjis_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_docomo(@sjis_softbank_only) }
+ end
+
+ # The shared emoji converts to every KDDI encoding; the *_softbank_only
+ # fixtures must raise UndefinedConversionError instead.
+ def test_to_kddi
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@utf8_softbank) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@utf8_softbank) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@utf8_softbank) }
+
+ assert_nothing_raised { assert_equal @utf8_kddi, to_utf8_kddi(@sjis_softbank) }
+ assert_nothing_raised { assert_equal @sjis_kddi, to_sjis_kddi(@sjis_softbank) }
+ assert_nothing_raised { assert_equal @iso2022jp_kddi, to_iso2022jp_kddi(@sjis_softbank) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@utf8_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@utf8_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@utf8_softbank_only) }
+
+ assert_raise(Encoding::UndefinedConversionError) { to_utf8_kddi(@sjis_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_sjis_kddi(@sjis_softbank_only) }
+ assert_raise(Encoding::UndefinedConversionError) { to_iso2022jp_kddi(@sjis_softbank_only) }
+ end
+ end
+
+ private
+
+ # Defines the shared fixtures as instance variables on +obj+ (the test
+ # cases pass +self+). The block runs via instance_eval, so the helper
+ # calls inside it are resolved on +obj+.
+ def setup_instance_variable(obj)
+ obj.instance_eval do
+ # Five hiragana characters, kept in UTF-8, Windows-31J and ISO-2022-JP.
+ @aiueo_utf8 = "\u{3042}\u{3044}\u{3046}\u{3048}\u{304A}"
+ @aiueo_sjis = to_sjis(@aiueo_utf8)
+ @aiueo_iso2022jp = to_iso2022jp(@aiueo_utf8)
+
+ # Plain UTF-8 character that the carrier emoji fixtures are compared with.
+ @utf8 = "\u{2600}"
+
+ # DoCoMo fixtures; the *_only ones are expected not to convert to
+ # other carriers.
+ @utf8_docomo = utf8_docomo("\u{E63E}")
+ @sjis_docomo = sjis_docomo("\xF8\x9F")
+ @utf8_docomo_only = utf8_docomo("\u{E6B1}")
+ @sjis_docomo_only = sjis_docomo("\xF9\x55")
+
+ # KDDI fixtures, including a second UTF-8 form (@utf8_undoc_kddi*).
+ @utf8_kddi = utf8_kddi("\u{E488}")
+ @utf8_undoc_kddi = utf8_kddi("\u{EF60}")
+ @sjis_kddi = sjis_kddi("\xF6\x60")
+ @iso2022jp_kddi = iso2022jp_kddi("\x1B$B\x75\x41\x1B(B")
+ @stateless_iso2022jp_kddi = stateless_iso2022jp_kddi("\x92\xF5\xC1")
+ @utf8_kddi_only = utf8_kddi("\u{E5B3}")
+ @utf8_undoc_kddi_only = utf8_kddi("\u{F0D0}")
+ @sjis_kddi_only = sjis_kddi("\xF7\xD0")
+ @iso2022jp_kddi_only = iso2022jp_kddi("\x1B$B\x78\x52\x1B(B")
+ @stateless_iso2022jp_kddi_only = stateless_iso2022jp_kddi("\x92\xF8\xD2")
+
+ # SoftBank fixtures.
+ @utf8_softbank = utf8_softbank("\u{E04A}")
+ @sjis_softbank = sjis_softbank("\xF9\x8B")
+ @utf8_softbank_only = utf8_softbank("\u{E524}")
+ @sjis_softbank_only = sjis_softbank("\xFB\xC4")
+ end
+ end
+
+ # Helper naming convention used below:
+ # * name(str)    relabels +str+ in place with force_encoding (no bytes change).
+ # * to_name(str) returns a transcoded copy via String#encode.
+
+ # Relabels +str+ as UTF-8.
+ def utf8(str)
+ str.force_encoding("UTF-8")
+ end
+
+ # Transcodes +str+ to UTF-8.
+ def to_utf8(str)
+ str.encode("UTF-8")
+ end
+
+ # Transcodes +str+ to Windows-31J.
+ def to_sjis(str)
+ str.encode("Windows-31J")
+ end
+
+ # Transcodes +str+ to eucJP-ms.
+ def to_eucjp(str)
+ str.encode("eucJP-ms")
+ end
+
+ # Transcodes +str+ to ISO-2022-JP.
+ def to_iso2022jp(str)
+ str.encode("ISO-2022-JP")
+ end
+
+ # Relabels +str+ as UTF8-DoCoMo.
+ def utf8_docomo(str)
+ str.force_encoding("UTF8-DoCoMo")
+ end
+
+ # Transcodes +str+ to UTF8-DoCoMo.
+ def to_utf8_docomo(str)
+ str.encode("UTF8-DoCoMo")
+ end
+
+ # Relabels +str+ as UTF8-KDDI.
+ def utf8_kddi(str)
+ str.force_encoding("UTF8-KDDI")
+ end
+
+ # Transcodes +str+ to UTF8-KDDI.
+ def to_utf8_kddi(str)
+ str.encode("UTF8-KDDI")
+ end
+
+ # Relabels +str+ as UTF8-SoftBank.
+ def utf8_softbank(str)
+ str.force_encoding("UTF8-SoftBank")
+ end
+
+ # Transcodes +str+ to UTF8-SoftBank.
+ def to_utf8_softbank(str)
+ str.encode("UTF8-SoftBank")
+ end
+
+ # Relabels +str+ as SJIS-DoCoMo.
+ def sjis_docomo(str)
+ str.force_encoding("SJIS-DoCoMo")
+ end
+
+ # Transcodes +str+ to SJIS-DoCoMo.
+ def to_sjis_docomo(str)
+ str.encode("SJIS-DoCoMo")
+ end
+
+ # Relabels +str+ as SJIS-KDDI.
+ def sjis_kddi(str)
+ str.force_encoding("SJIS-KDDI")
+ end
+
+ # Transcodes +str+ to SJIS-KDDI.
+ def to_sjis_kddi(str)
+ str.encode("SJIS-KDDI")
+ end
+
+ # Relabels +str+ as SJIS-SoftBank.
+ def sjis_softbank(str)
+ str.force_encoding("SJIS-SoftBank")
+ end
+
+ # Transcodes +str+ to SJIS-SoftBank.
+ def to_sjis_softbank(str)
+ str.encode("SJIS-SoftBank")
+ end
+
+ # Relabels +str+ as ISO-2022-JP-KDDI.
+ def iso2022jp_kddi(str)
+ str.force_encoding("ISO-2022-JP-KDDI")
+ end
+
+ # Transcodes +str+ to ISO-2022-JP-KDDI.
+ def to_iso2022jp_kddi(str)
+ str.encode("ISO-2022-JP-KDDI")
+ end
+
+ # Relabels +str+ as stateless-ISO-2022-JP-KDDI.
+ def stateless_iso2022jp_kddi(str)
+ str.force_encoding("stateless-ISO-2022-JP-KDDI")
+ end
+
+ # Transcodes +str+ to stateless-ISO-2022-JP-KDDI.
+ def to_stateless_iso2022jp_kddi(str)
+ str.encode("stateless-ISO-2022-JP-KDDI")
+ end
+
+end
diff --git a/test/ruby/enc/test_emoji_breaks.rb b/test/ruby/enc/test_emoji_breaks.rb
new file mode 100644
index 0000000000..0873e681c3
--- /dev/null
+++ b/test/ruby/enc/test_emoji_breaks.rb
@@ -0,0 +1,155 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+# Checks that each emoji sequence listed in the Unicode emoji data files is
+# treated as a single grapheme cluster by String#each_grapheme_cluster.
+# The data-driven tests are only defined when all data files exist.
+class TestEmojiBreaks < Test::Unit::TestCase
+ # One test case parsed from a single data-file line.
+ class BreakTest
+ attr_reader :string, :comment, :filename, :line_number, :type, :shortname
+
+ # filename::    basename of the data file the line came from
+ # line_number:: line number within that file
+ # data::        semicolon-separated fields (hex code points; type[; shortname])
+ # comment::     text after '#', whitespace-normalized; defaults to ''
+ def initialize(filename, line_number, data, comment='')
+ @filename = filename
+ @line_number = line_number
+ @comment = comment.gsub(/\s+/, ' ').strip
+ # These two files are parsed as two fields only, so shortname is empty.
+ if filename=='emoji-test' or filename=='emoji-variation-sequences'
+ codes, @type = data.split(/\s*;\s*/)
+ @shortname = ''
+ else
+ codes, @type, @shortname = data.split(/\s*;\s*/)
+ end
+ @type = @type.gsub(/\s+/, ' ').strip
+ @shortname = @shortname.gsub(/\s+/, ' ').strip
+ # Build the test string from the space-separated hex code points.
+ @string = codes.split(/\s+/)
+ .map do |ch|
+ c = ch.to_i(16)
+ # eliminate cases with surrogates
+ # raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+ c.chr('UTF-8')
+ end.join
+ end
+ end
+
+ # Describes one data file; every instance is also appended to FILES.
+ class BreakFile
+ attr_reader :basename, :fullname, :version
+ FILES = []
+
+ # fullname is "#{path}/#{basename}.txt"; version is stored as given.
+ def initialize(basename, path, version)
+ @basename = basename
+ @fullname = "#{path}/#{basename}.txt" # File.expand_path(path + version, __dir__)
+ @version = version
+ FILES << self
+ end
+
+ # Returns every BreakFile created so far.
+ def self.files
+ FILES
+ end
+ end
+
+ # Versions come from RbConfig; data paths are resolved relative to this file.
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+ UNICODE_DATA_PATH = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}/ucd/emoji", __dir__)
+ EMOJI_VERSION = RbConfig::CONFIG['UNICODE_EMOJI_VERSION']
+ EMOJI_DATA_PATH = File.expand_path("../../../enc/unicode/data/emoji/#{EMOJI_VERSION}", __dir__)
+
+ # Three files under the emoji data path plus emoji-variation-sequences
+ # under the UCD emoji path.
+ EMOJI_DATA_FILES = %w[emoji-sequences emoji-test emoji-zwj-sequences].map do |basename|
+ BreakFile.new(basename, EMOJI_DATA_PATH, EMOJI_VERSION)
+ end
+ UNICODE_DATA_FILE = BreakFile.new('emoji-variation-sequences', UNICODE_DATA_PATH, EMOJI_VERSION)
+ EMOJI_DATA_FILES << UNICODE_DATA_FILE
+
+ # True only when every listed data file exists on disk.
+ def self.data_files_available?
+ EMOJI_DATA_FILES.all? do |f|
+ File.exist?(f.fullname)
+ end
+ end
+
+ # Always defined: reports an omission when the data files are missing.
+ def test_data_files_available
+ assert_equal 4, EMOJI_DATA_FILES.size # debugging test
+ unless TestEmojiBreaks.data_files_available?
+ omit "Emoji data files not available in #{EMOJI_DATA_PATH}."
+ end
+ end
+
+ # Evaluated at class-definition time: the methods below exist only when
+ # the data files were found.
+ if data_files_available?
+ # Parses every data file into BreakTest objects and returns them all.
+ # Raises if a file's first line names a different file, or if no line
+ # confirms the expected version.
+ def read_data
+ tests = []
+ EMOJI_DATA_FILES.each do |file|
+ version_mismatch = true
+ file_tests = []
+ File.foreach(file.fullname, encoding: Encoding::UTF_8) do |line|
+ line.chomp!
+ # The first line must be "# <basename>-<version>.txt" or "# <basename>.txt".
+ if $.==1
+ if line=="# #{file.basename}-#{file.version}.txt"
+ version_mismatch = false
+ elsif line!="# #{file.basename}.txt"
+ raise "File Name Mismatch: line: #{line}, expected filename: #{file.basename}.txt"
+ end
+ end
+ version_mismatch = false if line =~ /^# Version: #{file.version}/ # 13.0 and older
+ version_mismatch = false if line =~ /^# Used with Emoji Version #{EMOJI_VERSION}/ # 14.0 and newer
+ # Skip comment lines and empty lines.
+ next if line.match?(/\A(#|\z)/)
+ if line =~ /^(\h{4,6})\.\.(\h{4,6}) *(;.+)/ # deal with Unicode ranges in emoji-sequences.txt (Bug #18028)
+ range_start = $1.to_i(16)
+ range_end = $2.to_i(16)
+ rest = $3
+ # Expand "XXXX..YYYY" into one test per code point in the range.
+ (range_start..range_end).each do |code_point|
+ file_tests << BreakTest.new(file.basename, $., *(code_point.to_s(16)+rest).split('#', 2))
+ end
+ else
+ file_tests << BreakTest.new(file.basename, $., *line.split('#', 2))
+ end
+ end
+ raise "File Version Mismatch: file: #{file.fullname}, version: #{file.version}" if version_mismatch
+ tests += file_tests
+ end
+ tests
+ end
+
+ # Memoizes the parsed tests in @@tests; falls back to an empty list when
+ # a data file cannot be opened (Errno::ENOENT).
+ def all_tests
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ # Each sequence on its own must form exactly one grapheme cluster.
+ def test_single_emoji
+ all_tests.each do |test|
+ expected = [test.string]
+ actual = test.string.each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file: #{test.filename}, line #{test.line_number}, " +
+ "type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
+ end
+ end
+
+ # Each sequence between two tabs must split into tab, sequence, tab.
+ def test_embedded_emoji
+ all_tests.each do |test|
+ expected = ["\t", test.string, "\t"]
+ actual = "\t#{test.string}\t".each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file: #{test.filename}, line #{test.line_number}, " +
+ "type: #{test.type}, shortname: #{test.shortname}, comment: #{test.comment}"
+ end
+ end
+
+ # test some pseudorandom combinations of emoji
+ # (srand 0 fixes the seed so the chosen pairs are reproducible)
+ def test_mixed_emoji
+ srand 0
+ length = all_tests.length
+ step = 503 # use a prime number
+ all_tests.each do |test1|
+ # Pair test1 with every 503rd test, starting at a random offset.
+ start = rand step
+ start.step(by: step, to: length-1) do |t2|
+ test2 = all_tests[t2]
+ # exclude skin tones, because they glue to previous grapheme clusters
+ next if (0x1F3FB..0x1F3FF).include? test2.string.ord
+ expected = [test1.string, test2.string]
+ actual = (test1.string+test2.string).each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "file1: #{test1.filename}, line1 #{test1.line_number}, " +
+ "file2: #{test2.filename}, line2 #{test2.line_number},\n" +
+ "type1: #{test1.type}, shortname1: #{test1.shortname}, comment1: #{test1.comment},\n" +
+ "type2: #{test2.type}, shortname2: #{test2.shortname}, comment2: #{test2.comment}"
+ end
+ end
+ end
+ end
+end
diff --git a/test/ruby/enc/test_euc_jp.rb b/test/ruby/enc/test_euc_jp.rb
new file mode 100644
index 0000000000..4aec69e4db
--- /dev/null
+++ b/test/ruby/enc/test_euc_jp.rb
@@ -0,0 +1,25 @@
+# vim: set fileencoding=euc-jp
+# frozen_string_literal: false
+
+require "test/unit"
+
+# Regexp tests for the EUC-JP encoding.
+# NOTE(review): the file declares fileencoding=euc-jp, and the multibyte
+# literals in the patterns and subject strings below appear to have been
+# lost or garbled in this rendering (empty groups, empty strings, a bare
+# "{0}" quantifier). Verify against the original EUC-JP bytes before
+# applying or editing these lines.
+class TestEUC_JP < Test::Unit::TestCase
+ # Case-insensitive matching with back-references.
+ def test_mbc_case_fold
+ assert_match(/()(a)\1\2/i, "aA")
+ assert_match(/()(a)\1\2/i, "aA")
+ end
+
+ # \p{Hiragana} and \p{Katakana} must match only the corresponding subject
+ # strings, and an unknown property name must raise RegexpError.
+ def test_property
+ assert_match(/{0}\p{Hiragana}{4}/, "Ҥ餬")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "")
+ assert_no_match(/{0}\p{Katakana}{4}/, "Ҥ餬")
+ assert_match(/{0}\p{Katakana}{4}/, "")
+ assert_no_match(/{0}\p{Katakana}{4}/, "")
+ assert_raise(RegexpError) { Regexp.new('{0}\p{foobarbaz}') }
+ end
+
+ # The byte pair A2 A2 straddles two characters in A1 A2 A2 A3, so the
+ # pattern must not match there.
+ def test_charboundary
+ assert_nil(/\xA2\xA2/ =~ "\xA1\xA2\xA2\xA3")
+ end
+end
diff --git a/test/ruby/enc/test_euc_kr.rb b/test/ruby/enc/test_euc_kr.rb
new file mode 100644
index 0000000000..c9de2cc4e1
--- /dev/null
+++ b/test/ruby/enc/test_euc_kr.rb
@@ -0,0 +1,37 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestEucKr < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("euc-kr")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-kr"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+
+ def test_euro_sign
+ assert_equal("\u{20ac}", s("\xa2\xe6").encode("utf-8"))
+ end
+
+ def test_registered_mark
+ assert_equal("\u{00ae}", s("\xa2\xe7").encode("utf-8"))
+ end
+end
diff --git a/test/ruby/enc/test_euc_tw.rb b/test/ruby/enc/test_euc_tw.rb
new file mode 100644
index 0000000000..649b1b81c6
--- /dev/null
+++ b/test/ruby/enc/test_euc_tw.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestEucTw < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("euc-tw")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-tw"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_gb18030.rb b/test/ruby/enc/test_gb18030.rb
new file mode 100644
index 0000000000..76ac785951
--- /dev/null
+++ b/test/ruby/enc/test_gb18030.rb
@@ -0,0 +1,127 @@
+# frozen_string_literal: false
+require "test/unit"
+
+# Tests for the GB18030 encoding, which has two- and four-byte sequences.
+# Most of the class exercises how String#chop finds the start of the last
+# character in ambiguous byte runs.
+class TestGB18030 < Test::Unit::TestCase
+ # Relabels +s+ in place as GB18030 and returns it.
+ def s(s)
+ s.force_encoding("gb18030")
+ end
+
+ # Both a two-byte and a four-byte sequence count as one character.
+ def test_mbc_enc_len
+ assert_equal(1, s("\x81\x40").size)
+ assert_equal(1, s("\x81\x30\x81\x30").size)
+ end
+
+ # String#ord of the two-byte character 81 40 is 0x8140.
+ def test_mbc_to_code
+ assert_equal(0x8140, s("\x81\x40").ord)
+ end
+
+ # Integer#chr rebuilds the same character from 0x8140.
+ def test_code_to_mbc
+ assert_equal(s("\x81\x40"), 0x8140.chr("gb18030"))
+ end
+
+ # A case-insensitive pattern with a back-reference matches the doubled character.
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\x81\x40)\\1"), "i")
+ assert_match(r, s("\x81\x40\x81\x40"))
+ end
+
+ # +c+ lists bytes from the END of the string backwards (it is reversed
+ # before joining). Asserts that chop removes exactly the last +i+ bytes
+ # of the resulting string.
+ def scheck(c, i)
+ assert_equal(s(c.reverse.take(c.size - i).join), s(c.reverse.join).chop)
+ end
+
+ # Same input convention as scheck, but expects chop to raise ArgumentError.
+ # No call to this helper appears in this class.
+ def fcheck(c)
+ c = s(c.reverse.join)
+ assert_raise(ArgumentError, c) { c.chop }
+ end
+
+ # Walks the backward-scanning state machine sketched in the comments
+ # below; each path listed there has a matching scheck call further down,
+ # in the same order.
+ def test_left_adjust_char_head
+ # C1: 00-2f, 3a-3f, 7f, ff
+ # C2: 40-7e, 80
+ # C4: 30-39
+ # CM: 81-fe
+ c1 = "\x2f"
+ c2 = "\x40"
+ c4 = "\x30"
+ cm = "\x81"
+
+ # S_START-c1
+ # S_START-c2-S_one_C2-0
+ # S_START-c2-S_one_C2-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-cm-S_odd_CM_one_CX(rec)
+ # S_START-c4-S_one_C4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c4-S_one_C4_odd_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-cm-S_odd_CM_odd_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-cm-S_odd_CM_even_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-cm-S_even_CM_one_CX(rec)
+ # S_START-cm-S_one_CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c4-S_odd_C4CM(rec)
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-cm-S_even_CM_even_C4CM(rec)
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-cm-S_even_CM_odd_C4CM(rec)
+ # S_START-cm-S_one_CM-cm-S_odd_CM_one_CX(rec)
+
+ # First argument: bytes read from the end of the string backwards.
+ # Second argument: number of trailing bytes chop is expected to remove.
+ scheck([c1], 1)
+ scheck([c2], 1)
+ scheck([c2, c1], 1)
+ scheck([c2, cm, c1], 2)
+ scheck([c2, cm, cm, c1], 1)
+ scheck([c2, cm, cm, cm], 2)
+ scheck([c4], 1)
+ scheck([c4, c1], 1)
+ scheck([c4, cm], 2)
+ scheck([c4, cm, c1], 2)
+ scheck([c4, cm, c4, c1], 2)
+ scheck([c4, cm, c4, cm], 4)
+ scheck([c4, cm, c4, cm, c1], 4)
+ scheck([c4, cm, c4, cm, c4], 4)
+ scheck([c4, cm, c4, cm, c4, c1], 4)
+ scheck([c4, cm, c4, cm, c4, cm], 2)
+ scheck([c4, cm, c4, cm, c4, cm, c1], 2)
+ scheck([c4, cm, c4, cm, c4, cm, c4], 2)
+ scheck([c4, cm, c4, cm, c4, cm, cm, c1], 4)
+ scheck([c4, cm, c4, cm, c4, cm, cm, cm], 2)
+ scheck([c4, cm, c4, cm, c4, cm, cm, cm, c1], 2)
+ scheck([c4, cm, c4, cm, c4, cm, cm, cm, cm], 4)
+ scheck([c4, cm, c4, cm, cm, c1], 2)
+ scheck([c4, cm, c4, cm, cm, cm], 4)
+ scheck([c4, cm, c4, cm, cm, cm, c1], 4)
+ scheck([c4, cm, c4, cm, cm, cm, cm], 2)
+ scheck([c4, cm, cm], 1)
+ scheck([cm], 1)
+ scheck([cm, c1], 1)
+ scheck([cm, c4, c1], 1)
+ scheck([cm, c4, cm], 3)
+ scheck([cm, c4, cm, c1], 3)
+ scheck([cm, c4, cm, c4], 3)
+ scheck([cm, c4, cm, c4, c1], 3)
+ scheck([cm, c4, cm, c4, cm], 1)
+ scheck([cm, c4, cm, c4, cm, c1], 1)
+ scheck([cm, c4, cm, c4, cm, c4], 1)
+ scheck([cm, c4, cm, c4, cm, cm, c1], 3)
+ scheck([cm, c4, cm, c4, cm, cm, cm], 1)
+ scheck([cm, c4, cm, c4, cm, cm, cm, c1], 1)
+ scheck([cm, c4, cm, c4, cm, cm, cm, cm], 3)
+ scheck([cm, c4, cm, cm, c1], 1)
+ scheck([cm, c4, cm, cm, cm], 3)
+ scheck([cm, c4, cm, cm, cm, c1], 3)
+ scheck([cm, c4, cm, cm, cm, cm], 1)
+ scheck([cm, cm], 2)
+ end
+end
diff --git a/test/ruby/enc/test_gbk.rb b/test/ruby/enc/test_gbk.rb
new file mode 100644
index 0000000000..2e541b5821
--- /dev/null
+++ b/test/ruby/enc/test_gbk.rb
@@ -0,0 +1,29 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestGBK < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("gbk")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\x81\x40").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0x8140, s("\x81\x40").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\x81\x40"), 0x8140.chr("gbk"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\x81\x40)\\1"), "i")
+ assert_match(r, s("\x81\x40\x81\x40"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\x81\x40"), s("\x81\x40\x81\x40").chop)
+ end
+end
diff --git a/test/ruby/enc/test_grapheme_breaks.rb b/test/ruby/enc/test_grapheme_breaks.rb
new file mode 100644
index 0000000000..7e6d722d40
--- /dev/null
+++ b/test/ruby/enc/test_grapheme_breaks.rb
@@ -0,0 +1,92 @@
+# frozen_string_literal: true
+# Copyright © 2018 Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class TestGraphemeBreaksFromFile < Test::Unit::TestCase
+ class BreakTest
+ attr_reader :clusters, :string, :comment, :line_number
+
+ def initialize(line_number, data, comment)
+ @line_number = line_number
+ @comment = comment
+ @clusters = data.sub(/\A\s*÷\s*/, '')
+ .sub(/\s*÷\s*\z/, '')
+ .split(/\s*÷\s*/)
+ .map do |cl|
+ cl.split(/\s*×\s*/)
+ .map do |ch|
+ c = ch.to_i(16)
+ # eliminate cases with surrogates
+ raise ArgumentError if 0xD800 <= c and c <= 0xDFFF
+ c.chr('UTF-8')
+ end.join
+ end
+ @string = @clusters.join
+ end
+ end
+
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+ path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+ UNICODE_DATA_PATH = File.directory?("#{path}/ucd/auxiliary") ? "#{path}/ucd/auxiliary" : path
+ GRAPHEME_BREAK_TEST_FILE = File.expand_path("#{UNICODE_DATA_PATH}/GraphemeBreakTest.txt", __dir__)
+
+ def self.file_available?
+ File.exist? GRAPHEME_BREAK_TEST_FILE
+ end
+
+ def test_data_files_available
+ unless TestGraphemeBreaksFromFile.file_available?
+ omit "Unicode data file GraphemeBreakTest not available in #{UNICODE_DATA_PATH}."
+ end
+ end
+
+ if file_available?
+ def read_data
+ tests = []
+ File.foreach(GRAPHEME_BREAK_TEST_FILE, encoding: Encoding::UTF_8) do |line|
+ if $. == 1 and not line.start_with?("# GraphemeBreakTest-#{UNICODE_VERSION}.txt")
+ raise "File Version Mismatch"
+ end
+ next if /\A#/.match? line
+ tests << BreakTest.new($., *line.chomp.split('#')) rescue 'whatever'
+ end
+ tests
+ end
+
+ def all_tests
+ @@tests ||= read_data
+ rescue Errno::ENOENT
+ @@tests ||= []
+ end
+
+ def test_each_grapheme_cluster
+ all_tests.each do |test|
+ expected = test.clusters
+ actual = test.string.each_grapheme_cluster.to_a
+ assert_equal expected, actual,
+ "line #{test.line_number}, expected '#{expected}', " +
+ "but got '#{actual}', comment: #{test.comment}"
+ end
+ end
+
+ def test_backslash_X
+ all_tests.each do |test|
+ clusters = test.clusters.dup
+ string = test.string.dup
+ removals = 0
+ while string.sub!(/\A\X/, '')
+ removals += 1
+ clusters.shift
+ expected = clusters.join
+ assert_equal expected, string,
+ "line #{test.line_number}, removals: #{removals}, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ assert_equal expected, string,
+ "line #{test.line_number}, after last removal, expected '#{expected}', " +
+ "but got '#{string}', comment: #{test.comment}"
+ end
+ end
+ end
+end
diff --git a/test/ruby/enc/test_iso_8859.rb b/test/ruby/enc/test_iso_8859.rb
new file mode 100644
index 0000000000..ed663be243
--- /dev/null
+++ b/test/ruby/enc/test_iso_8859.rb
@@ -0,0 +1,166 @@
+# frozen_string_literal: false
+require 'test/unit'
+
+# Case-insensitive Regexp matching tests for the ISO-8859-x single-byte
+# encodings. Every test passes eval a source text whose first line is an
+# "# encoding: ..." magic comment naming the encoding under test.
+class TestISO8859 < Test::Unit::TestCase
+ # Assertion source shared by the encodings that use the same layout. Each
+ # occurrence of the ENCODING placeholder is substituted with the encoding
+ # name before the text is eval'ed. It checks that byte 0xDF and "ss"/"SS"
+ # match each other under /i, that every byte in 0xC0..0xDE except 0xD7
+ # matches the byte 0x20 above it (as a back-reference and inside a
+ # character class), and that 0xFF matches itself.
+ ASSERTS = %q(
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ # assert_match(/^(\xdf)\1$/i, "\xdfss") # this must be bug...
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ ((0xc0..0xde).to_a - [0xd7]).each do |c|
+ c1 = c.chr("ENCODING")
+ c2 = (c + 0x20).chr("ENCODING")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ assert_match(/^\xff$/i, "\xff")
+ )
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-1.
+ def test_iso_8859_1
+ eval("# encoding: iso8859-1\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-1"))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-2.
+ def test_iso_8859_2
+ eval("# encoding: iso8859-2\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-2"))
+ end
+
+ # Does not use ASSERTS: besides the 0xDF checks it pairs the bytes 0xA1,
+ # 0xA6, 0xAA..0xAC and 0xAF with the byte 0x10 above them, and 0xC0..0xDE
+ # (minus 0xC3, 0xD0 and 0xD7) with the byte 0x20 above them.
+ def test_iso_8859_3
+ # todo: decide on behavior, test, and fix implementation re. İ and ı (0xA9/0xB9)
+ # treating them as case equivalents is definitely an error
+ eval(%q(# encoding: iso8859-3
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ [0xa1, 0xa6, *(0xaa..0xac), 0xaf].each do |c|
+ c1 = c.chr("iso8859-3")
+ c2 = (c + 0x10).chr("iso8859-3")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ([*(0xc0..0xde)] - [0xc3, 0xd0, 0xd7]).each do |c|
+ c1 = c.chr("iso8859-3")
+ c2 = (c + 0x20).chr("iso8859-3")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-4.
+ def test_iso_8859_4
+ eval("# encoding: iso8859-4\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-4"))
+ end
+
+ # Pairs 0xB0..0xCF with the byte 0x20 above them, and 0xA1..0xAF (except
+ # 0xAD) with the byte 0x50 above them.
+ def test_iso_8859_5
+ eval(%q(# encoding: iso8859-5
+ (0xb0..0xcf).each do |c|
+ c1 = c.chr("iso8859-5")
+ c2 = (c + 0x20).chr("iso8859-5")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ((0xa1..0xaf).to_a - [0xad]).each do |c|
+ c1 = c.chr("iso8859-5")
+ c2 = (c + 0x50).chr("iso8859-5")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+
+ # Only checks that each listed byte matches a back-reference to itself
+ # under /i; no pairs of different bytes are compared.
+ def test_iso_8859_6
+ eval(%q(# encoding: iso8859-6
+ [0xa4, 0xac, 0xbb, 0xbf, *(0xc1..0xda), *(0xe0..0xf2)].each do |c|
+ c1 = c.chr("iso8859-6")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+
+ # Every byte in 0xA0..0xFE except 0xAE and 0xD2 must match itself; the
+ # bytes 0xC1..0xD9 except 0xD2 are additionally paired with the byte 0x20
+ # above them.
+ def test_iso_8859_7
+ eval(%q(# encoding: iso8859-7
+ ((0xa0..0xfe).to_a - [0xae, 0xd2]).each do |c|
+ c1 = c.chr("iso8859-7")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ((0xc1..0xd9).to_a - [0xd2]).each do |c|
+ c1 = c.chr("iso8859-7")
+ c2 = (c + 0x20).chr("iso8859-7")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+
+ # Only checks that each listed byte matches a back-reference to itself
+ # under /i.
+ def test_iso_8859_8
+ eval(%q(# encoding: iso8859-8
+ [0xa0, *(0xa2..0xbe), *(0xdf..0xfa), 0xfc, 0xfd].each do |c|
+ c1 = c.chr("iso8859-8")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+
+ # Same checks as ASSERTS, except that 0xDD is excluded in addition to 0xD7
+ # and there is no assertion for 0xFF.
+ def test_iso_8859_9
+ eval(%q(# encoding: iso8859-9
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ ([*(0xc0..0xde)] - [0xd7, 0xdd]).each do |c|
+ c1 = c.chr("iso8859-9")
+ c2 = (c + 0x20).chr("iso8859-9")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-10.
+ def test_iso_8859_10
+ eval("# encoding: iso8859-10\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-10"))
+ end
+
+ # Only checks that each byte in 0xA0..0xDA and 0xDF..0xFB matches a
+ # back-reference to itself under /i.
+ def test_iso_8859_11
+ eval(%q(# encoding: iso8859-11
+ [*(0xa0..0xda), *(0xdf..0xfb)].each do |c|
+ c1 = c.chr("iso8859-11")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-13.
+ def test_iso_8859_13
+ eval("# encoding: iso8859-13\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-13"))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-14.
+ def test_iso_8859_14
+ eval("# encoding: iso8859-14\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-14"))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-15.
+ def test_iso_8859_15
+ eval("# encoding: iso8859-15\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-15"))
+ end
+
+ # Runs ASSERTS with ENCODING replaced by iso8859-16.
+ def test_iso_8859_16
+ eval("# encoding: iso8859-16\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-16"))
+ end
+end
+
diff --git a/test/ruby/enc/test_koi8.rb b/test/ruby/enc/test_koi8.rb
new file mode 100644
index 0000000000..4a4d233e8d
--- /dev/null
+++ b/test/ruby/enc/test_koi8.rb
@@ -0,0 +1,23 @@
+# frozen_string_literal: false
+require "test/unit"
+
+class TestKOI8 < Test::Unit::TestCase
+ ASSERTS = %q(
+ (0xc0..0xdf).each do |c|
+ c1 = c.chr("ENCODING")
+ c2 = (c + 0x20).chr("ENCODING")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ )
+
+ def test_koi8_r
+ eval("# encoding: koi8-r\n" + ASSERTS.gsub("ENCODING", "koi8-r"))
+ end
+
+ def test_koi8_u
+ eval("# encoding: koi8-u\n" + ASSERTS.gsub("ENCODING", "koi8-u"))
+ end
+end
diff --git a/test/ruby/enc/test_regex_casefold.rb b/test/ruby/enc/test_regex_casefold.rb
new file mode 100644
index 0000000000..b5d5c6e337
--- /dev/null
+++ b/test/ruby/enc/test_regex_casefold.rb
@@ -0,0 +1,120 @@
+# Copyright Kimihito Matsui (松井 仁人) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
+
+require "test/unit"
+
+class TestCaseFold < Test::Unit::TestCase
+
+ UNICODE_VERSION = RbConfig::CONFIG['UNICODE_VERSION']
+ path = File.expand_path("../../../enc/unicode/data/#{UNICODE_VERSION}", __dir__)
+ UNICODE_DATA_PATH = File.directory?("#{path}/ucd") ? "#{path}/ucd" : path
+ CaseTest = Struct.new :source, :target, :kind, :line
+
+ def check_downcase_properties(expected, start, *flags)
+ assert_equal expected, start.downcase(*flags)
+ temp = start.dup
+ assert_equal expected, temp.downcase!(*flags)
+ assert_equal expected, expected.downcase(*flags)
+ temp = expected
+ assert_nil temp.downcase!(*flags)
+ end
+
+ def read_tests
+ File.readlines("#{UNICODE_DATA_PATH}/CaseFolding.txt", encoding: Encoding::ASCII_8BIT)
+ .collect.with_index { |linedata, linenumber| [linenumber.to_i+1, linedata.chomp] }
+ .reject { |number, data| data =~ /^(#|$)/ }
+ .collect do |linenumber, linedata|
+ data, _ = linedata.split(/#\s*/)
+ code, kind, result, _ = data.split(/;\s*/)
+ CaseTest.new code.to_i(16).chr('UTF-8'),
+ result.split(/ /).collect { |hex| hex.to_i(16) }.pack('U*'),
+ kind, linenumber
+ end.select { |test| test.kind=='C' }
+ end
+
+ def to_codepoints(string)
+ string.codepoints.collect { |cp| cp.to_s(16).upcase.rjust(4, '0') }
+ end
+
+ def setup
+ @@tests ||= read_tests
+ rescue Errno::ENOENT => e
+ @@tests ||= []
+ omit e.message
+ end
+
+ def self.generate_test_casefold(encoding)
+ define_method "test_mbc_case_fold_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ assert_equal 5, "12345#{target}67890" =~ /#{source}/i,
+ "12345#{to_codepoints(target)}67890 and /#{to_codepoints(source)}/ do not match case-insensitive " +
+ "(CaseFolding.txt line #{test[:line]})"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+
+ define_method "test_get_case_fold_codes_by_str_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ assert_equal 5, "12345#{source}67890" =~ /#{target}/i,
+ "12345#{to_codepoints(source)}67890 and /#{to_codepoints(target)}/ do not match case-insensitive " +
+ "(CaseFolding.txt line #{test[:line]}), " +
+ "error may also be triggered by mbc_case_fold"
+ rescue Encoding::UndefinedConversionError
+ end
+ end
+ end
+
+ define_method "test_apply_all_case_fold_#{encoding}" do
+ @@tests.each do |test|
+ begin
+ source = test.source.encode encoding
+ target = test.target.encode encoding
+ reg = '\p{Upper}'
+ regexp = Regexp.compile reg.encode(encoding)
+ regexpi = Regexp.compile reg.encode(encoding), Regexp::IGNORECASE
+ assert_equal 5, "12345#{target}67890" =~ regexpi,
+ "12345#{to_codepoints(target)}67890 and /#{reg}/i do not match " +
+ "(CaseFolding.txt line #{test[:line]})"
+ rescue Encoding::UndefinedConversionError
+ source = source
+ regexp = regexp
+ end
+ end
+ end
+ end
+
+ def test_downcase_fold
+ @@tests.each do |test|
+ check_downcase_properties test.target, test.source, :fold
+ end
+ end
+
+ # start with good encodings only
+ generate_test_casefold 'US-ASCII'
+ generate_test_casefold 'ISO-8859-1'
+ generate_test_casefold 'ISO-8859-2'
+ generate_test_casefold 'ISO-8859-3'
+ generate_test_casefold 'ISO-8859-4'
+ generate_test_casefold 'ISO-8859-5'
+ generate_test_casefold 'ISO-8859-6'
+ # generate_test_casefold 'ISO-8859-7'
+ generate_test_casefold 'ISO-8859-8'
+ generate_test_casefold 'ISO-8859-9'
+ generate_test_casefold 'ISO-8859-10'
+ generate_test_casefold 'ISO-8859-11'
+ generate_test_casefold 'ISO-8859-13'
+ generate_test_casefold 'ISO-8859-14'
+ generate_test_casefold 'ISO-8859-15'
+ generate_test_casefold 'ISO-8859-16'
+ generate_test_casefold 'Windows-1250'
+ # generate_test_casefold 'Windows-1251'
+ generate_test_casefold 'Windows-1252'
+ generate_test_casefold 'koi8-r'
+ generate_test_casefold 'koi8-u'
+end
diff --git a/test/ruby/enc/test_shift_jis.rb b/test/ruby/enc/test_shift_jis.rb
new file mode 100644
index 0000000000..059992d167
--- /dev/null
+++ b/test/ruby/enc/test_shift_jis.rb
@@ -0,0 +1,28 @@
+# vim: set fileencoding=shift_jis
+# frozen_string_literal: false
+
+require "test/unit"
+
+class TestShiftJIS < Test::Unit::TestCase
+ def test_mbc_case_fold
+ assert_match(/()(a)\1\2/i, "aA")
+ assert_match(/()(a)\1\2/i, "a`A")
+ end
+
+ def test_property
+ assert_match(/{0}\p{Hiragana}{4}/, "Ђ炪")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "J^Ji")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "")
+ assert_no_match(/{0}\p{Katakana}{4}/, "Ђ炪")
+ assert_match(/{0}\p{Katakana}{4}/, "J^Ji")
+ assert_no_match(/{0}\p{Katakana}{4}/, "")
+ assert_raise(RegexpError) { Regexp.new('{0}\p{foobarbaz}') }
+ end
+
+ def test_code_to_mbclen
+ s = ""
+ s << 0x82a9
+ assert_equal("", s)
+ assert_raise(RangeError) { s << 0x82 }
+ end
+end
diff --git a/test/ruby/enc/test_utf16.rb b/test/ruby/enc/test_utf16.rb
new file mode 100644
index 0000000000..e08f2ea14e
--- /dev/null
+++ b/test/ruby/enc/test_utf16.rb
@@ -0,0 +1,397 @@
+# frozen_string_literal: false
+require 'test/unit'
+
+class TestUTF16 < Test::Unit::TestCase
+ def encdump(obj)
+ case obj
+ when String
+ d = obj.dump
+ if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
+ d
+ else
+ "#{d}.force_encoding(#{obj.encoding.name.dump})"
+ end
+ when Regexp
+ "Regexp.new(#{encdump(obj.source)}, #{obj.options})"
+ else
+ raise Argument, "unexpected: #{obj.inspect}"
+ end
+ end
+
+ def enccall(recv, meth, *args)
+ desc = ''
+ if String === recv
+ desc << encdump(recv)
+ else
+ desc << recv.inspect
+ end
+ desc << '.' << meth.to_s
+ if !args.empty?
+ desc << '('
+ args.each_with_index {|a, i|
+ desc << ',' if 0 < i
+ if String === a
+ desc << encdump(a)
+ else
+ desc << a.inspect
+ end
+ }
+ desc << ')'
+ end
+ result = nil
+ assert_nothing_raised(desc) {
+ result = recv.send(meth, *args)
+ }
+ result
+ end
+
+ def assert_str_equal(expected, actual, message=nil)
+ full_message = build_message(message, <<EOT)
+#{encdump expected} expected but not equal to
+#{encdump actual}.
+EOT
+ assert_equal(expected, actual, full_message)
+ end
+
+ # tests start
+
+ def test_utf16be_valid_encoding
+ all_assertions do |a|
+ [
+ "\x00\x00",
+ "\xd7\xff",
+ "\xd8\x00\xdc\x00",
+ "\xdb\xff\xdf\xff",
+ "\xe0\x00",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ a.for(s) {
+ assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\xd8\x00",
+ "\xd8\x00\xd8\x00",
+ "\xdc\x00",
+ "\xdc\x00\xd8\x00",
+ "\xdc\x00\xdc\x00",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ a.for(s) {
+ assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ end
+ end
+
+ def test_utf16le_valid_encoding
+ all_assertions do |a|
+ [
+ "\x00\x00",
+ "\xff\xd7",
+ "\x00\xd8\x00\xdc",
+ "\xff\xdb\xff\xdf",
+ "\x00\xe0",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ a.for(s) {
+ assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\x00\xd8",
+ "\x00\xd8\x00\xd8",
+ "\x00\xdc",
+ "\x00\xdc\x00\xd8",
+ "\x00\xdc\x00\xdc",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ a.for(s) {
+ assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ end
+ end
+
+ def test_strftime
+ s = "aa".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) }
+ end
+
+ def test_intern
+ s = "aaaa".force_encoding("utf-16be")
+ assert_equal(s.encoding, s.intern.to_s.encoding, "#{encdump s}.intern.to_s.encoding")
+ end
+
+ def test_sym_eq
+ s = "aa".force_encoding("utf-16le")
+ assert_not_equal(:aa, s.intern, "#{encdump s}.intern != :aa")
+ end
+
+ def test_compatible
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "z".force_encoding("us-ascii")
+ assert_nil(Encoding.compatible?(s1, s2), "Encoding.compatible?(#{encdump s1}, #{encdump s2})")
+ end
+
+ def test_casecmp
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "AA"
+ assert_not_equal(0, s1.casecmp(s2), "#{encdump s1}.casecmp(#{encdump s2})")
+ end
+
+ def test_end_with
+ s1 = "ab".force_encoding("utf-16be")
+ s2 = "b".force_encoding("utf-16be")
+ assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})")
+ end
+
+ def test_hex
+ assert_raise(Encoding::CompatibilityError) {
+ "ff".encode("utf-16le").hex
+ }
+ assert_raise(Encoding::CompatibilityError) {
+ "ff".encode("utf-16be").hex
+ }
+ end
+
+ def test_oct
+ assert_raise(Encoding::CompatibilityError) {
+ "77".encode("utf-16le").oct
+ }
+ assert_raise(Encoding::CompatibilityError) {
+ "77".encode("utf-16be").oct
+ }
+ end
+
+ def test_count
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "aa"
+ assert_raise(Encoding::CompatibilityError, "#{encdump s1}.count(#{encdump s2})") {
+ s1.count(s2)
+ }
+ end
+
+ def test_plus
+ s1 = "a".force_encoding("us-ascii")
+ s2 = "aa".force_encoding("utf-16be")
+ assert_raise(Encoding::CompatibilityError, "#{encdump s1} + #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_encoding_find
+ assert_raise(ArgumentError) {
+ Encoding.find("utf-8".force_encoding("utf-16be"))
+ }
+ end
+
+ def test_interpolation
+ s = "aa".force_encoding("utf-16be")
+ assert_raise(Encoding::CompatibilityError, "\"a\#{#{encdump s}}\"") {
+ "a#{s}"
+ }
+ end
+
+ def test_slice!
+ enccall("aa".force_encoding("UTF-16BE"), :slice!, -1)
+ end
+
+ def test_plus_empty1
+ s1 = ""
+ s2 = "aa".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_plus_empty2
+ s1 = "aa"
+ s2 = "".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_plus_nonempty
+ s1 = "aa"
+ s2 = "bb".force_encoding("utf-16be")
+ assert_raise(Encoding::CompatibilityError, "#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_concat_empty1
+ s1 = ""
+ s2 = "aa".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_concat_empty2
+ s1 = "aa"
+ s2 = "".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_concat_nonempty
+ s1 = "aa"
+ s2 = "bb".force_encoding("utf-16be")
+ assert_raise(Encoding::CompatibilityError, "#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_chomp
+ s = "\1\n".force_encoding("utf-16be")
+ assert_equal(s, s.chomp, "#{encdump s}.chomp")
+ s = "\0\n".force_encoding("utf-16be")
+ assert_equal("", s.chomp, "#{encdump s}.chomp")
+ s = "\0\r\0\n".force_encoding("utf-16be")
+ assert_equal("", s.chomp, "#{encdump s}.chomp")
+ end
+
+ def test_succ
+ s = "\xff\xff".force_encoding("utf-16be")
+ assert_predicate(s.succ, :valid_encoding?, "#{encdump s}.succ.valid_encoding?")
+
+ s = "\xdb\xff\xdf\xff".force_encoding("utf-16be")
+ assert_predicate(s.succ, :valid_encoding?, "#{encdump s}.succ.valid_encoding?")
+ end
+
+ def test_regexp_union
+ enccall(Regexp, :union, "aa".force_encoding("utf-16be"), "bb".force_encoding("utf-16be"))
+ end
+
+ def test_empty_regexp
+ s = "".force_encoding("utf-16be")
+ assert_equal(Encoding.find("utf-16be"), Regexp.new(s).encoding,
+ "Regexp.new(#{encdump s}).encoding")
+ end
+
+ def test_regexp_match
+ assert_raise(Encoding::CompatibilityError) { Regexp.new("aa".force_encoding("utf-16be")) =~ "aa" }
+ end
+
+ def test_gsub
+ s = "abcd".force_encoding("utf-16be")
+ assert_nothing_raised {
+ s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
+ }
+ s = "ab\0\ncd".force_encoding("utf-16be")
+ assert_raise(Encoding::CompatibilityError) {
+ s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
+ }
+ end
+
+ def test_split_awk
+ s = " ab cd ".encode("utf-16be")
+ r = s.split(" ".encode("utf-16be"))
+ assert_equal(2, r.length)
+ assert_str_equal("ab".encode("utf-16be"), r[0])
+ assert_str_equal("cd".encode("utf-16be"), r[1])
+ end
+
+ def test_count2
+ e = "abc".count("^b")
+ assert_equal(e, "abc".encode("utf-16be").count("^b".encode("utf-16be")))
+ assert_equal(e, "abc".encode("utf-16le").count("^b".encode("utf-16le")))
+ end
+
+ def test_header
+ assert_raise(ArgumentError) { eval("# encoding:utf-16le\nfoo") }
+ assert_raise(ArgumentError) { eval("# encoding:utf-16be\nfoo") }
+ end
+
+
+ def test_is_mbc_newline
+ sl = "f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n\0".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n".force_encoding("utf-16be")
+ al = sl.lines.to_a
+ ab = sb.lines.to_a
+ assert_equal("f\0o\0o\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("b\0a\0r\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("b\0a\0z\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("\0f\0o\0o\0\n".force_encoding("utf-16be"), ab.shift)
+ assert_equal("\0b\0a\0r\0\n".force_encoding("utf-16be"), ab.shift)
+ assert_equal("\0b\0a\0z\0\n".force_encoding("utf-16be"), ab.shift)
+
+ sl = "f\0o\0o\0\n\0".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\0\n".force_encoding("utf-16be")
+ sl2 = "f\0o\0o\0".force_encoding("utf-16le")
+ sb2 = "\0f\0o\0o".force_encoding("utf-16be")
+ assert_equal(sl2, sl.chomp)
+ assert_equal(sl2, sl.chomp.chomp)
+ assert_equal(sb2, sb.chomp)
+ assert_equal(sb2, sb.chomp.chomp)
+
+ sl = "f\0o\0o\0\n".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\n".force_encoding("utf-16be")
+ assert_equal(sl, sl.chomp)
+ assert_equal(sb, sb.chomp)
+ end
+
+ def test_code_to_mbc
+ assert_equal("a\0".force_encoding("utf-16le"), "a".ord.chr("utf-16le"))
+ assert_equal("\0a".force_encoding("utf-16be"), "a".ord.chr("utf-16be"))
+ end
+
+ def utf8_to_utf16(s, e)
+ s.chars.map {|c| c.ord.chr(e) }.join
+ end
+
+ def test_mbc_case_fold
+ rl = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16le"), "i")
+ rb = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16be"), "i")
+ assert_equal(Encoding.find("utf-16le"), rl.encoding)
+ assert_equal(Encoding.find("utf-16be"), rb.encoding)
+ assert_match(rl, utf8_to_utf16("\u3042a\u3042a", "utf-16le"))
+ assert_match(rb, utf8_to_utf16("\u3042a\u3042a", "utf-16be"))
+ end
+
+ def test_surrogate_pair
+ sl = "\x42\xd8\xb7\xdf".force_encoding("utf-16le")
+ sb = "\xd8\x42\xdf\xb7".force_encoding("utf-16be")
+
+ assert_equal(1, sl.size)
+ assert_equal(1, sb.size)
+ assert_equal(0x20bb7, sl.ord)
+ assert_equal(0x20bb7, sb.ord)
+ assert_equal(sl, 0x20bb7.chr("utf-16le"))
+ assert_equal(sb, 0x20bb7.chr("utf-16be"))
+ assert_equal("", sl.chop)
+ assert_equal("", sb.chop)
+ end
+
+ def test_regexp_escape
+ s = "\0*".force_encoding("UTF-16BE")
+ r = Regexp.new(Regexp.escape(s))
+ assert_match(r, s, "#{encdump(r)} =~ #{encdump(s)}")
+ end
+
+ def test_casecmp2
+ assert_equal(0, "\0A".force_encoding("UTF-16BE").casecmp("\0a".force_encoding("UTF-16BE")))
+ assert_not_equal(0, "\0A".force_encoding("UTF-16LE").casecmp("\0a".force_encoding("UTF-16LE")))
+ assert_not_equal(0, "A\0".force_encoding("UTF-16BE").casecmp("a\0".force_encoding("UTF-16BE")))
+ assert_equal(0, "A\0".force_encoding("UTF-16LE").casecmp("a\0".force_encoding("UTF-16LE")))
+
+ ary = ["01".force_encoding("UTF-16LE"),
+ "10".force_encoding("UTF-16LE")]
+ e = ary.sort {|x,y| x <=> y }
+ a = ary.sort {|x,y| x.casecmp(y) }
+ assert_equal(e, a)
+ end
+end
diff --git a/test/ruby/enc/test_utf32.rb b/test/ruby/enc/test_utf32.rb
new file mode 100644
index 0000000000..76379abca0
--- /dev/null
+++ b/test/ruby/enc/test_utf32.rb
@@ -0,0 +1,162 @@
+# frozen_string_literal: false
+require 'test/unit'
+
+class TestUTF32 < Test::Unit::TestCase
+ def encdump(str)
+ d = str.dump
+ if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
+ d
+ else
+ "#{d}.force_encoding(#{str.encoding.name.dump})"
+ end
+ end
+
+ def assert_str_equal(expected, actual, message=nil)
+ full_message = build_message(message, <<EOT)
+#{encdump expected} expected but not equal to
+#{encdump actual}.
+EOT
+ assert_equal(expected, actual, full_message)
+ end
+
+ def test_substr
+ assert_str_equal(
+ "abcdefgh".force_encoding("utf-32le"),
+ "abcdefgh".force_encoding("utf-32le")[0,3])
+ assert_str_equal(
+ "abcdefgh".force_encoding("utf-32be"),
+ "abcdefgh".force_encoding("utf-32be")[0,3])
+ end
+
+ def test_mbc_len
+ al = "abcdefghijkl".force_encoding("utf-32le").each_char.to_a
+ ab = "abcdefghijkl".force_encoding("utf-32be").each_char.to_a
+ assert_equal("abcd".force_encoding("utf-32le"), al.shift)
+ assert_equal("efgh".force_encoding("utf-32le"), al.shift)
+ assert_equal("ijkl".force_encoding("utf-32le"), al.shift)
+ assert_equal("abcd".force_encoding("utf-32be"), ab.shift)
+ assert_equal("efgh".force_encoding("utf-32be"), ab.shift)
+ assert_equal("ijkl".force_encoding("utf-32be"), ab.shift)
+ end
+
+ def ascii_to_utf16le(s)
+ s.unpack("C*").map {|x| [x,0,0,0] }.flatten.pack("C*").force_encoding("utf-32le")
+ end
+
+ def ascii_to_utf16be(s)
+ s.unpack("C*").map {|x| [0,0,0,x] }.flatten.pack("C*").force_encoding("utf-32be")
+ end
+
+ def test_mbc_newline
+ al = ascii_to_utf16le("foo\nbar\nbaz\n").lines.to_a
+ ab = ascii_to_utf16be("foo\nbar\nbaz\n").lines.to_a
+
+ assert_equal(ascii_to_utf16le("foo\n"), al.shift)
+ assert_equal(ascii_to_utf16le("bar\n"), al.shift)
+ assert_equal(ascii_to_utf16le("baz\n"), al.shift)
+ assert_equal(ascii_to_utf16be("foo\n"), ab.shift)
+ assert_equal(ascii_to_utf16be("bar\n"), ab.shift)
+ assert_equal(ascii_to_utf16be("baz\n"), ab.shift)
+
+ sl = "a\0".force_encoding("utf-32le")
+ sb = "a\0".force_encoding("utf-32be")
+ assert_equal(sl, sl.chomp)
+ assert_equal(sb, sb.chomp)
+ end
+
+ def test_mbc_to_code
+ sl = "a\0\0\0".force_encoding("utf-32le")
+ sb = "\0\0\0a".force_encoding("utf-32be")
+ assert_equal("a".ord, sl.ord)
+ assert_equal("a".ord, sb.ord)
+ end
+
+ def utf8_to_utf32(s, e)
+ s.chars.map {|c| c.ord.chr(e) }.join
+ end
+
+ def test_mbc_case_fold
+ rl = Regexp.new(utf8_to_utf32("^(\u3042)(a)\\1\\2$", "utf-32le"), "i")
+ rb = Regexp.new(utf8_to_utf32("^(\u3042)(a)\\1\\2$", "utf-32be"), "i")
+ assert_equal(Encoding.find("utf-32le"), rl.encoding)
+ assert_equal(Encoding.find("utf-32be"), rb.encoding)
+ assert_match(rl, utf8_to_utf32("\u3042a\u3042a", "utf-32le"))
+ assert_match(rb, utf8_to_utf32("\u3042a\u3042a", "utf-32be"))
+ end
+
+ def test_code_to_mbc
+ sl = "a\0\0\0".force_encoding("utf-32le")
+ sb = "\0\0\0a".force_encoding("utf-32be")
+ assert_equal(sl, "a".ord.chr("utf-32le"))
+ assert_equal(sb, "a".ord.chr("utf-32be"))
+ end
+
+ def test_utf32be_valid_encoding
+ all_assertions do |a|
+ [
+ "\x00\x00\x00\x00",
+ "\x00\x00\x00a",
+ "\x00\x00\x30\x40",
+ "\x00\x00\xd7\xff",
+ "\x00\x00\xe0\x00",
+ "\x00\x00\xff\xff",
+ "\x00\x10\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-32be")
+ a.for(s) {
+ assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ [
+ "a",
+ "\x00a",
+ "\x00\x00a",
+ "\x00\x00\xd8\x00",
+ "\x00\x00\xdb\xff",
+ "\x00\x00\xdc\x00",
+ "\x00\x00\xdf\xff",
+ "\x00\x11\x00\x00",
+ ].each {|s|
+ s.force_encoding("utf-32be")
+ a.for(s) {
+ assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ end
+ end
+
+ def test_utf32le_valid_encoding
+ all_assertions do |a|
+ [
+ "\x00\x00\x00\x00",
+ "a\x00\x00\x00",
+ "\x40\x30\x00\x00",
+ "\xff\xd7\x00\x00",
+ "\x00\xe0\x00\x00",
+ "\xff\xff\x00\x00",
+ "\xff\xff\x10\x00",
+ ].each {|s|
+ s.force_encoding("utf-32le")
+ a.for(s) {
+ assert_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ [
+ "a",
+ "a\x00",
+ "a\x00\x00",
+ "\x00\xd8\x00\x00",
+ "\xff\xdb\x00\x00",
+ "\x00\xdc\x00\x00",
+ "\xff\xdf\x00\x00",
+ "\x00\x00\x11\x00",
+ ].each {|s|
+ s.force_encoding("utf-32le")
+ a.for(s) {
+ assert_not_predicate(s, :valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ }
+ end
+ end
+end
+
diff --git a/test/ruby/enc/test_windows_1251.rb b/test/ruby/enc/test_windows_1251.rb
new file mode 100644
index 0000000000..002dbaa3cc
--- /dev/null
+++ b/test/ruby/enc/test_windows_1251.rb
@@ -0,0 +1,17 @@
+# encoding:windows-1251
+# frozen_string_literal: false
+
+require "test/unit"
+
+class TestWindows1251 < Test::Unit::TestCase
+ def test_windows_1251
+ (0xc0..0xdf).each do |c|
+ c1 = c.chr("windows-1251")
+ c2 = (c + 0x20).chr("windows-1251")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ end
+end
diff --git a/test/ruby/enc/test_windows_1252.rb b/test/ruby/enc/test_windows_1252.rb
new file mode 100644
index 0000000000..f264cba759
--- /dev/null
+++ b/test/ruby/enc/test_windows_1252.rb
@@ -0,0 +1,26 @@
+# encoding:windows-1252
+# frozen_string_literal: false
+
+require "test/unit"
+
+class TestWindows1252 < Test::Unit::TestCase
+ def test_stset
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ # assert_match(/^(\xdf)\1$/i, "\xdfss") # this must be bug...
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ end
+
+ def test_windows_1252
+ [0x8a, 0x8c, 0x8e, *0xc0..0xd6, *0xd8..0xde, 0x9f].zip([0x9a, 0x9c, 0x9e, *0xe0..0xf6, *0xf8..0xfe, 0xff]).each do |c1, c2|
+ c1 = c1.chr("windows-1252")
+ c2 = c2.chr("windows-1252")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ end
+end