summaryrefslogtreecommitdiff
path: root/test/ruby/enc
diff options
context:
space:
mode:
authornobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-07-16 19:33:15 +0000
committernobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-07-16 19:33:15 +0000
commitc60b57ade0f4b31c2177615dca9839d40baaa49d (patch)
treeaed882f80d4940a1053e56b0747cb95f2689ae2e /test/ruby/enc
parent47e3f4e1aed4b71816e05fe608e97c523ea7ccfd (diff)
* test/ruby/enc: moved tests for particular encodings.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18095 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'test/ruby/enc')
-rw-r--r--test/ruby/enc/test_big5.rb28
-rw-r--r--test/ruby/enc/test_cp949.rb28
-rw-r--r--test/ruby/enc/test_euc_jp.rb20
-rw-r--r--test/ruby/enc/test_euc_kr.rb28
-rw-r--r--test/ruby/enc/test_euc_tw.rb28
-rw-r--r--test/ruby/enc/test_gb18030.rb125
-rw-r--r--test/ruby/enc/test_gbk.rb28
-rw-r--r--test/ruby/enc/test_iso_8859.rb163
-rw-r--r--test/ruby/enc/test_shift_jis.rb27
-rw-r--r--test/ruby/enc/test_utf16.rb358
-rw-r--r--test/ruby/enc/test_utf32.rb93
-rw-r--r--test/ruby/enc/test_windows_1251.rb16
12 files changed, 942 insertions, 0 deletions
diff --git a/test/ruby/enc/test_big5.rb b/test/ruby/enc/test_big5.rb
new file mode 100644
index 0000000000..e8fe0270a8
--- /dev/null
+++ b/test/ruby/enc/test_big5.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+class TestBig5 < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("big5")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("big5"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_cp949.rb b/test/ruby/enc/test_cp949.rb
new file mode 100644
index 0000000000..e675c7b80c
--- /dev/null
+++ b/test/ruby/enc/test_cp949.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+class TestCP949 < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("cp949")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("cp949"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_euc_jp.rb b/test/ruby/enc/test_euc_jp.rb
new file mode 100644
index 0000000000..82abe2116d
--- /dev/null
+++ b/test/ruby/enc/test_euc_jp.rb
@@ -0,0 +1,20 @@
+# vim: set fileencoding=euc-jp
+
+require "test/unit"
+
+class TestEUC_JP < Test::Unit::TestCase
+ def test_mbc_case_fold
+ assert_match(/(a)(a)\1\2/i, "aaaA")
+ assert_no_match(/(a)(a)\1\2/i, "aaAA")
+ end
+
+ def test_property
+ assert_match(/あ{0}\p{Hiragana}{4}/, "ひらがな")
+ assert_no_match(/あ{0}\p{Hiragana}{4}/, "カタカナ")
+ assert_no_match(/あ{0}\p{Hiragana}{4}/, "漢字漢字")
+ assert_no_match(/あ{0}\p{Katakana}{4}/, "ひらがな")
+ assert_match(/あ{0}\p{Katakana}{4}/, "カタカナ")
+ assert_no_match(/あ{0}\p{Katakana}{4}/, "漢字漢字")
+ assert_raise(RegexpError) { Regexp.new('あ{0}\p{foobarbaz}') }
+ end
+end
diff --git a/test/ruby/enc/test_euc_kr.rb b/test/ruby/enc/test_euc_kr.rb
new file mode 100644
index 0000000000..087bc795f7
--- /dev/null
+++ b/test/ruby/enc/test_euc_kr.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+class TestEucKr < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("euc-kr")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-kr"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_euc_tw.rb b/test/ruby/enc/test_euc_tw.rb
new file mode 100644
index 0000000000..f36d86b088
--- /dev/null
+++ b/test/ruby/enc/test_euc_tw.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+class TestEucTw < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("euc-tw")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\xa1\xa1").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0xa1a1, s("\xa1\xa1").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\xa1\xa1"), 0xa1a1.chr("euc-tw"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\xa1\xa1)\\1"), "i")
+ assert_match(r, s("\xa1\xa1\xa1\xa1"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\xa1\xa1"), s("\xa1\xa1\xa1\xa1").chop)
+ end
+end
diff --git a/test/ruby/enc/test_gb18030.rb b/test/ruby/enc/test_gb18030.rb
new file mode 100644
index 0000000000..a33a9eb28e
--- /dev/null
+++ b/test/ruby/enc/test_gb18030.rb
@@ -0,0 +1,125 @@
+require "test/unit"
+
+class TestGB18030 < Test::Unit::TestCase
+ def s(s) # tags the string as GB18030 in place and returns it
+ s.force_encoding("gb18030")
+ end
+ # A two-byte (81 40) and a four-byte (81 30 81 30) sequence must each count as one character.
+ def test_mbc_enc_len
+ assert_equal(1, s("\x81\x40").size)
+ assert_equal(1, s("\x81\x30\x81\x30").size)
+ end
+ # ord of the two-byte character 81 40 must be 0x8140.
+ def test_mbc_to_code
+ assert_equal(0x8140, s("\x81\x40").ord)
+ end
+ # Integer#chr must rebuild the bytes 81 40 from 0x8140.
+ def test_code_to_mbc
+ assert_equal(s("\x81\x40"), 0x8140.chr("gb18030"))
+ end
+ # A case-insensitive back-reference must match a repeated two-byte character.
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\x81\x40)\\1"), "i")
+ assert_match(r, s("\x81\x40\x81\x40"))
+ end
+ # Asserts that chop on the reversed byte list +c+ removes exactly its last +i+ bytes.
+ def scheck(c, i)
+ assert_equal(s(c.reverse.take(c.size - i).join), s(c.reverse.join).chop)
+ end
+ # Asserts that chop on the reversed byte list +c+ raises ArgumentError.
+ def fcheck(c)
+ assert_raise(ArgumentError) { s(c.reverse.join).chop }
+ end
+ # Drives chop over many trailing byte patterns; each array lists bytes from the end of the string backwards.
+ def test_left_adjust_char_head
+ # C1: 00-2f, 3a-3f, 7f, ff
+ # C2: 40-7e, 80
+ # C4: 30-39
+ # CM: 81-fe
+ c1 = "\x2f"
+ c2 = "\x40"
+ c4 = "\x30"
+ cm = "\x81"
+ # Byte-class paths exercised below; the S_* names look like states of a backward scan in the encoding implementation (confirm).
+ # S_START-c1
+ # S_START-c2-S_one_C2-0
+ # S_START-c2-S_one_C2-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-c1
+ # S_START-c2-S_one_C2-cm-S_odd_CM_one_CX-cm-S_even_CM_one_CX-cm-S_odd_CM_one_CX(rec)
+ # S_START-c4-S_one_C4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-c4-S_one_C4_odd_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-c4-S_one_C4_even_CMC4-cm-S_odd_CMC4-cm-S_odd_CM_odd_CMC4-cm-S_even_CM_odd_CMC4-cm-S_odd_CM_odd_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-c1
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-c4-S_one_C4_odd_CMC4-cm-S_even_CMC4-cm-S_odd_CM_even_CMC4-cm-S_even_CM_even_CMC4-cm-S_odd_CM_even_CMC4(rec)
+ # S_START-c4-S_one_C4-cm-S_one_CMC4-cm-S_even_CM_one_CX(rec)
+ # S_START-cm-S_one_CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-c4-S_odd_C4CM(rec)
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-c4-S_even_C4CM-cm-S_one_CM_even_C4CM-cm-S_even_CM_even_C4CM-cm-S_odd_CM_even_C4CM-cm-S_even_CM_even_C4CM(rec)
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-c1
+ # S_START-cm-S_one_CM-c4-S_odd_C4CM-cm-S_one_CM_odd_C4CM-cm-S_even_CM_odd_C4CM-cm-S_odd_CM_odd_C4CM-cm-S_even_CM_odd_C4CM(rec)
+ # S_START-cm-S_one_CM-cm-S_odd_CM_one_CX(rec)
+ # scheck(bytes, n): chop must drop n bytes; fcheck(bytes): chop must raise ArgumentError.
+ scheck([c1], 1)
+ scheck([c2], 1)
+ scheck([c2, c1], 1)
+ scheck([c2, cm, c1], 2)
+ scheck([c2, cm, cm, c1], 1)
+ scheck([c2, cm, cm, cm], 2)
+ scheck([c4], 1)
+ scheck([c4, c1], 1)
+ scheck([c4, cm], 2)
+ fcheck([c4, cm, c1])
+ fcheck([c4, cm, c4, c1])
+ scheck([c4, cm, c4, cm], 4)
+ scheck([c4, cm, c4, cm, c1], 4)
+ scheck([c4, cm, c4, cm, c4], 4)
+ scheck([c4, cm, c4, cm, c4, c1], 4)
+ fcheck([c4, cm, c4, cm, c4, cm])
+ fcheck([c4, cm, c4, cm, c4, cm, c1])
+ fcheck([c4, cm, c4, cm, c4, cm, c4])
+ scheck([c4, cm, c4, cm, c4, cm, cm, c1], 4)
+ fcheck([c4, cm, c4, cm, c4, cm, cm, cm])
+ fcheck([c4, cm, c4, cm, c4, cm, cm, cm, c1])
+ scheck([c4, cm, c4, cm, c4, cm, cm, cm, cm], 4)
+ fcheck([c4, cm, c4, cm, cm, c1])
+ scheck([c4, cm, c4, cm, cm, cm], 4)
+ scheck([c4, cm, c4, cm, cm, cm, c1], 4)
+ fcheck([c4, cm, c4, cm, cm, cm, cm])
+ scheck([c4, cm, cm], 1)
+ scheck([cm], 1)
+ fcheck([cm, c1])
+ fcheck([cm, c4, c1])
+ scheck([cm, c4, cm], 3)
+ fcheck([cm, c4, cm, c1])
+ fcheck([cm, c4, cm, c4])
+ fcheck([cm, c4, cm, c4, c1])
+ fcheck([cm, c4, cm, c4, cm])
+ fcheck([cm, c4, cm, c4, cm, c1])
+ fcheck([cm, c4, cm, c4, cm, c4])
+ fcheck([cm, c4, cm, c4, cm, cm, c1])
+ fcheck([cm, c4, cm, c4, cm, cm, cm])
+ fcheck([cm, c4, cm, c4, cm, cm, cm, c1])
+ fcheck([cm, c4, cm, c4, cm, cm, cm, cm])
+ fcheck([cm, c4, cm, cm, c1])
+ fcheck([cm, c4, cm, cm, cm])
+ fcheck([cm, c4, cm, cm, cm, c1])
+ fcheck([cm, c4, cm, cm, cm, cm])
+ scheck([cm, cm], 2)
+ end
+end
diff --git a/test/ruby/enc/test_gbk.rb b/test/ruby/enc/test_gbk.rb
new file mode 100644
index 0000000000..d6dc5d6d1b
--- /dev/null
+++ b/test/ruby/enc/test_gbk.rb
@@ -0,0 +1,28 @@
+require "test/unit"
+
+class TestGBK < Test::Unit::TestCase
+ def s(s)
+ s.force_encoding("gbk")
+ end
+
+ def test_mbc_enc_len
+ assert_equal(1, s("\x81\x40").size)
+ end
+
+ def test_mbc_to_code
+ assert_equal(0x8140, s("\x81\x40").ord)
+ end
+
+ def test_code_to_mbc
+ assert_equal(s("\x81\x40"), 0x8140.chr("gbk"))
+ end
+
+ def test_mbc_case_fold
+ r = Regexp.new(s("(\x81\x40)\\1"), "i")
+ assert_match(r, s("\x81\x40\x81\x40"))
+ end
+
+ def test_left_adjust_char_head
+ assert_equal(s("\x81\x40"), s("\x81\x40\x81\x40").chop)
+ end
+end
diff --git a/test/ruby/enc/test_iso_8859.rb b/test/ruby/enc/test_iso_8859.rb
new file mode 100644
index 0000000000..64cc7cd76d
--- /dev/null
+++ b/test/ruby/enc/test_iso_8859.rb
@@ -0,0 +1,163 @@
+require 'test/unit'
+
+class TestISO8859 < Test::Unit::TestCase # case-insensitive regexp checks per ISO-8859 part; ASSERTS is a shared script with an ENCODING placeholder
+ ASSERTS = %q(
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ # assert_match(/^(\xdf)\1$/i, "\xdfss") # this must be bug...
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ ((0xc0..0xde).to_a - [0xd7]).each do |c|
+ c1 = c.chr("ENCODING")
+ c2 = (c + 0x20).chr("ENCODING")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ assert_match(/^\xff$/i, "\xff")
+ )
+ # Evals ASSERTS with ENCODING replaced by "iso8859-1", under an "# encoding: iso8859-1" magic comment.
+ def test_iso_8859_1
+ eval("# encoding: iso8859-1\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-1"))
+ end
+ # Same shared script, substituted and evaluated for iso8859-2.
+ def test_iso_8859_2
+ eval("# encoding: iso8859-2\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-2"))
+ end
+ # iso8859-3 has its own script: the 0xDF checks, byte pairs 0x10 apart for a few 0xA1-0xAF bytes, and pairs 0x20 apart for 0xC0-0xDE minus C3, D0, D7.
+ def test_iso_8859_3
+ eval(%q(# encoding: iso8859-3
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ [0xa1, 0xa6, *(0xa9..0xac), 0xaf].each do |c|
+ c1 = c.chr("iso8859-3")
+ c2 = (c + 0x10).chr("iso8859-3")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ([*(0xc0..0xde)] - [0xc3, 0xd0, 0xd7]).each do |c|
+ c1 = c.chr("iso8859-3")
+ c2 = (c + 0x20).chr("iso8859-3")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+ # Shared script, substituted and evaluated for iso8859-4.
+ def test_iso_8859_4
+ eval("# encoding: iso8859-4\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-4"))
+ end
+ # iso8859-5: byte pairs 0x20 apart for 0xB0-0xCF and 0x50 apart for 0xA1-0xAF minus AD must match each other under /i.
+ def test_iso_8859_5
+ eval(%q(# encoding: iso8859-5
+ (0xb0..0xcf).each do |c|
+ c1 = c.chr("iso8859-5")
+ c2 = (c + 0x20).chr("iso8859-5")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ((0xa1..0xaf).to_a - [0xad]).each do |c|
+ c1 = c.chr("iso8859-5")
+ c2 = (c + 0x50).chr("iso8859-5")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+ # iso8859-6: each listed byte only has to match itself through a back-reference under /i.
+ def test_iso_8859_6
+ eval(%q(# encoding: iso8859-6
+ [0xa4, 0xac, 0xbb, 0xbf, *(0xc1..0xda), *(0xe0..0xf2)].each do |c|
+ c1 = c.chr("iso8859-6")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+ # iso8859-7: bytes 0xA0-0xFE minus AE, D2 match themselves; 0xC1-0xD9 minus D2 pair with the byte 0x20 above.
+ def test_iso_8859_7
+ eval(%q(# encoding: iso8859-7
+ ((0xa0..0xfe).to_a - [0xae, 0xd2]).each do |c|
+ c1 = c.chr("iso8859-7")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ((0xc1..0xd9).to_a - [0xd2]).each do |c|
+ c1 = c.chr("iso8859-7")
+ c2 = (c + 0x20).chr("iso8859-7")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+ # iso8859-8: each listed byte only has to match itself through a back-reference under /i.
+ def test_iso_8859_8
+ eval(%q(# encoding: iso8859-8
+ [0xa0, *(0xa2..0xbe), *(0xdf..0xfa), 0xfc, 0xfd].each do |c|
+ c1 = c.chr("iso8859-8")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+ # iso8859-9: the 0xDF checks plus byte pairs 0x20 apart for 0xC0-0xDC minus D7.
+ def test_iso_8859_9
+ eval(%q(# encoding: iso8859-9
+ assert_match(/^(\xdf)\1$/i, "\xdf\xdf")
+ assert_match(/^(\xdf)\1$/i, "ssss")
+ assert_match(/^[\xdfz]+$/i, "sszzsszz")
+ assert_match(/^SS$/i, "\xdf")
+ assert_match(/^Ss$/i, "\xdf")
+ ([*(0xc0..0xdc)] - [0xd7]).each do |c|
+ c1 = c.chr("iso8859-9")
+ c2 = (c + 0x20).chr("iso8859-9")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ ))
+ end
+ # Shared script, substituted and evaluated for iso8859-10.
+ def test_iso_8859_10
+ eval("# encoding: iso8859-10\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-10"))
+ end
+ # iso8859-11: bytes 0xA0-0xDA and 0xDF-0xFB only have to match themselves through a back-reference under /i.
+ def test_iso_8859_11
+ eval(%q(# encoding: iso8859-11
+ [*(0xa0..0xda), *(0xdf..0xfb)].each do |c|
+ c1 = c.chr("iso8859-11")
+ assert_match(/^(#{ c1 })\1$/i, c1 * 2)
+ end
+ ))
+ end
+ # Shared script, substituted and evaluated for iso8859-13.
+ def test_iso_8859_13
+ eval("# encoding: iso8859-13\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-13"))
+ end
+ # Shared script, substituted and evaluated for iso8859-14.
+ def test_iso_8859_14
+ eval("# encoding: iso8859-14\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-14"))
+ end
+ # Shared script, substituted and evaluated for iso8859-15.
+ def test_iso_8859_15
+ eval("# encoding: iso8859-15\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-15"))
+ end
+ # Shared script, substituted and evaluated for iso8859-16.
+ def test_iso_8859_16
+ eval("# encoding: iso8859-16\n" + ASSERTS.gsub(/ENCODING/m, "iso8859-16"))
+ end
+end
+
diff --git a/test/ruby/enc/test_shift_jis.rb b/test/ruby/enc/test_shift_jis.rb
new file mode 100644
index 0000000000..f81cb7801c
--- /dev/null
+++ b/test/ruby/enc/test_shift_jis.rb
@@ -0,0 +1,27 @@
+# vim: set fileencoding=shift_jis
+
+require "test/unit"
+
+class TestShiftJIS < Test::Unit::TestCase
+ def test_mbc_case_fold
+ assert_match(/()(a)\1\2/i, "aA")
+ assert_no_match(/()(a)\1\2/i, "a`A")
+ end
+
+ def test_property
+ assert_match(/{0}\p{Hiragana}{4}/, "")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "J^Ji")
+ assert_no_match(/{0}\p{Hiragana}{4}/, "")
+ assert_no_match(/{0}\p{Katakana}{4}/, "")
+ assert_match(/{0}\p{Katakana}{4}/, "J^Ji")
+ assert_no_match(/{0}\p{Katakana}{4}/, "")
+ assert_raise(RegexpError) { Regexp.new('{0}\p{foobarbaz}') }
+ end
+
+ def test_code_to_mbclen
+ s = ""
+ s << 0x82a9
+ assert_equal("", s)
+ assert_raise(ArgumentError) { s << 0x82 }
+ end
+end
diff --git a/test/ruby/enc/test_utf16.rb b/test/ruby/enc/test_utf16.rb
new file mode 100644
index 0000000000..c10463b2b3
--- /dev/null
+++ b/test/ruby/enc/test_utf16.rb
@@ -0,0 +1,358 @@
+require 'test/unit'
+
+class TestUTF16 < Test::Unit::TestCase
+ def encdump(str)
+ d = str.dump
+ if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
+ d
+ else
+ "#{d}.force_encoding(#{str.encoding.name.dump})"
+ end
+ end
+
+ def enccall(recv, meth, *args)
+ desc = ''
+ if String === recv
+ desc << encdump(recv)
+ else
+ desc << recv.inspect
+ end
+ desc << '.' << meth.to_s
+ if !args.empty?
+ desc << '('
+ args.each_with_index {|a, i|
+ desc << ',' if 0 < i
+ if String === a
+ desc << encdump(a)
+ else
+ desc << a.inspect
+ end
+ }
+ desc << ')'
+ end
+ result = nil
+ assert_nothing_raised(desc) {
+ result = recv.send(meth, *args)
+ }
+ result
+ end
+
+ def assert_str_equal(expected, actual, message=nil)
+ full_message = build_message(message, <<EOT)
+#{encdump expected} expected but not equal to
+#{encdump actual}.
+EOT
+ assert_block(full_message) { expected == actual }
+ end
+
+ # tests start
+
+ def test_utf16be_valid_encoding
+ [
+ "\x00\x00",
+ "\xd7\xff",
+ "\xd8\x00\xdc\x00",
+ "\xdb\xff\xdf\xff",
+ "\xe0\x00",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\xd8\x00",
+ "\xd8\x00\xd8\x00",
+ "\xdc\x00",
+ "\xdc\x00\xd8\x00",
+ "\xdc\x00\xdc\x00",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16be")
+ assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ end
+
+ def test_utf16le_valid_encoding
+ [
+ "\x00\x00",
+ "\xff\xd7",
+ "\x00\xd8\x00\xdc",
+ "\xff\xdb\xff\xdf",
+ "\x00\xe0",
+ "\xff\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ assert_equal(true, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ [
+ "\x00",
+ "\xd7",
+ "\x00\xd8",
+ "\x00\xd8\x00\xd8",
+ "\x00\xdc",
+ "\x00\xdc\x00\xd8",
+ "\x00\xdc\x00\xdc",
+ "\xe0",
+ "\xff",
+ ].each {|s|
+ s.force_encoding("utf-16le")
+ assert_equal(false, s.valid_encoding?, "#{encdump s}.valid_encoding?")
+ }
+ end
+
+ def test_strftime
+ s = "aa".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "Time.now.strftime(#{encdump s})") { Time.now.strftime(s) }
+ end
+
+ def test_intern
+ s = "aaaa".force_encoding("utf-16be")
+ assert_equal(s.encoding, s.intern.to_s.encoding, "#{encdump s}.intern.to_s.encoding")
+ end
+
+ def test_sym_eq
+ s = "aa".force_encoding("utf-16le")
+ assert(s.intern != :aa, "#{encdump s}.intern != :aa")
+ end
+
+ def test_compatible
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "z".force_encoding("us-ascii")
+ assert_nil(Encoding.compatible?(s1, s2), "Encoding.compatible?(#{encdump s1}, #{encdump s2})")
+ end
+
+ def test_casecmp
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "AA"
+ assert_not_equal(0, s1.casecmp(s2), "#{encdump s1}.casecmp(#{encdump s2})")
+ end
+
+ def test_end_with
+ s1 = "ab".force_encoding("utf-16be")
+ s2 = "b".force_encoding("utf-16be")
+ assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})")
+ end
+
+ def test_hex
+ assert_raise(ArgumentError) {
+ "ff".encode("utf-16le").hex
+ }
+ assert_raise(ArgumentError) {
+ "ff".encode("utf-16be").hex
+ }
+ end
+
+ def test_oct
+ assert_raise(ArgumentError) {
+ "77".encode("utf-16le").oct
+ }
+ assert_raise(ArgumentError) {
+ "77".encode("utf-16be").oct
+ }
+ end
+
+ def test_count
+ s1 = "aa".force_encoding("utf-16be")
+ s2 = "aa"
+ assert_raise(ArgumentError, "#{encdump s1}.count(#{encdump s2})") {
+ s1.count(s2)
+ }
+ end
+
+ def test_plus
+ s1 = "a".force_encoding("us-ascii")
+ s2 = "aa".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "#{encdump s1} + #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_encoding_find
+ assert_raise(ArgumentError) {
+ Encoding.find("utf-8".force_encoding("utf-16be"))
+ }
+ end
+
+ def test_interpolation
+ s = "aa".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "\"a\#{#{encdump s}}\"") {
+ "a#{s}"
+ }
+ end
+
+ def test_slice!
+ enccall("aa".force_encoding("UTF-16BE"), :slice!, -1)
+ end
+
+ def test_plus_empty1
+ s1 = ""
+ s2 = "aa".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_plus_empty2
+ s1 = "aa"
+ s2 = "".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_plus_nonempty
+ s1 = "aa"
+ s2 = "bb".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "#{encdump s1} << #{encdump s2}") {
+ s1 + s2
+ }
+ end
+
+ def test_concat_empty1
+ s1 = ""
+ s2 = "aa".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_concat_empty2
+ s1 = "aa"
+ s2 = "".force_encoding("utf-16be")
+ assert_nothing_raised("#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_concat_nonempty
+ s1 = "aa"
+ s2 = "bb".force_encoding("utf-16be")
+ assert_raise(ArgumentError, "#{encdump s1} << #{encdump s2}") {
+ s1 << s2
+ }
+ end
+
+ def test_chomp
+ s = "\1\n".force_encoding("utf-16be")
+ assert_equal(s, s.chomp, "#{encdump s}.chomp")
+ s = "\0\n".force_encoding("utf-16be")
+ assert_equal("", s.chomp, "#{encdump s}.chomp")
+ s = "\0\r\0\n".force_encoding("utf-16be")
+ assert_equal("", s.chomp, "#{encdump s}.chomp")
+ end
+
+ def test_succ
+ s = "\xff\xff".force_encoding("utf-16be")
+ assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")
+
+ s = "\xdb\xff\xdf\xff".force_encoding("utf-16be")
+ assert(s.succ.valid_encoding?, "#{encdump s}.succ.valid_encoding?")
+ end
+
+ def test_regexp_union
+ enccall(Regexp, :union, "aa".force_encoding("utf-16be"), "bb".force_encoding("utf-16be"))
+ end
+
+ def test_empty_regexp
+ s = "".force_encoding("utf-16be")
+ assert_equal(Encoding.find("utf-16be"), Regexp.new(s).encoding,
+ "Regexp.new(#{encdump s}).encoding")
+ end
+
+ def test_regexp_match
+ assert_raise(ArgumentError) { Regexp.new("aa".force_encoding("utf-16be")) =~ "aa" }
+ end
+
+ def test_gsub
+ s = "abcd".force_encoding("utf-16be")
+ assert_nothing_raised {
+ s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
+ }
+ s = "ab\0\ncd".force_encoding("utf-16be")
+ assert_raise(ArgumentError) {
+ s.gsub(Regexp.new(".".encode("utf-16be")), "xy")
+ }
+ end
+
+ def test_split_awk
+ s = " ab cd ".encode("utf-16be")
+ r = s.split(" ".encode("utf-16be"))
+ assert_equal(2, r.length)
+ assert_str_equal("ab".encode("utf-16be"), r[0])
+ assert_str_equal("cd".encode("utf-16be"), r[1])
+ end
+
+ def test_count2
+ e = "abc".count("^b")
+ assert_equal(e, "abc".encode("utf-16be").count("^b".encode("utf-16be")))
+ assert_equal(e, "abc".encode("utf-16le").count("^b".encode("utf-16le")))
+ end
+
+ def test_header
+ assert_raise(ArgumentError) { eval("# encoding:utf-16le\nfoo") }
+ assert_raise(ArgumentError) { eval("# encoding:utf-16be\nfoo") }
+ end
+
+
+ def test_is_mbc_newline
+ sl = "f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n\0".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\0\n\0b\0a\0r\0\n\0b\0a\0z\0\n".force_encoding("utf-16be")
+ al = sl.lines.to_a
+ ab = sb.lines.to_a
+ assert_equal("f\0o\0o\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("b\0a\0r\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("b\0a\0z\0\n\0".force_encoding("utf-16le"), al.shift)
+ assert_equal("\0f\0o\0o\0\n".force_encoding("utf-16be"), ab.shift)
+ assert_equal("\0b\0a\0r\0\n".force_encoding("utf-16be"), ab.shift)
+ assert_equal("\0b\0a\0z\0\n".force_encoding("utf-16be"), ab.shift)
+
+ sl = "f\0o\0o\0\n\0".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\0\n".force_encoding("utf-16be")
+ sl2 = "f\0o\0o\0".force_encoding("utf-16le")
+ sb2 = "\0f\0o\0o".force_encoding("utf-16be")
+ assert_equal(sl2, sl.chomp)
+ assert_equal(sl2, sl.chomp.chomp)
+ assert_equal(sb2, sb.chomp)
+ assert_equal(sb2, sb.chomp.chomp)
+
+ sl = "f\0o\0o\0\n".force_encoding("utf-16le")
+ sb = "\0f\0o\0o\n".force_encoding("utf-16be")
+ assert_equal(sl, sl.chomp)
+ assert_equal(sb, sb.chomp)
+ end
+
+ def test_code_to_mbc
+ assert_equal("a\0".force_encoding("utf-16le"), "a".ord.chr("utf-16le"))
+ assert_equal("\0a".force_encoding("utf-16be"), "a".ord.chr("utf-16be"))
+ end
+
+ def utf8_to_utf16(s, e)
+ s.chars.map {|c| c.ord.chr(e) }.join
+ end
+
+ def test_mbc_case_fold
+ rl = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16le"), "i")
+ rb = Regexp.new(utf8_to_utf16("^(\u3042)(a)\\1\\2$", "utf-16be"), "i")
+ assert_equal(Encoding.find("utf-16le"), rl.encoding)
+ assert_equal(Encoding.find("utf-16be"), rb.encoding)
+ assert_match(rl, utf8_to_utf16("\u3042a\u3042a", "utf-16le"))
+ assert_match(rb, utf8_to_utf16("\u3042a\u3042a", "utf-16be"))
+ end
+
+ def test_surrogate_pair
+ sl = "\x42\xd8\xb7\xdf".force_encoding("utf-16le")
+ sb = "\xd8\x42\xdf\xb7".force_encoding("utf-16be")
+
+ assert_equal(1, sl.size)
+ assert_equal(1, sb.size)
+ assert_equal(0x20bb7, sl.ord)
+ assert_equal(0x20bb7, sb.ord)
+ assert_equal(sl, 0x20bb7.chr("utf-16le"))
+ assert_equal(sb, 0x20bb7.chr("utf-16be"))
+ assert_equal("", sl.chop)
+ assert_equal("", sb.chop)
+ end
+end
diff --git a/test/ruby/enc/test_utf32.rb b/test/ruby/enc/test_utf32.rb
new file mode 100644
index 0000000000..3d4a458512
--- /dev/null
+++ b/test/ruby/enc/test_utf32.rb
@@ -0,0 +1,93 @@
+require 'test/unit'
+
+class TestUTF32 < Test::Unit::TestCase
+ def encdump(str)
+ d = str.dump
+ if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d
+ d
+ else
+ "#{d}.force_encoding(#{str.encoding.name.dump})"
+ end
+ end
+
+ def assert_str_equal(expected, actual, message=nil)
+ full_message = build_message(message, <<EOT)
+#{encdump expected} expected but not equal to
+#{encdump actual}.
+EOT
+ assert_block(full_message) { expected == actual }
+ end
+
+ def test_substr
+ assert_str_equal(
+ "abcdefgh".force_encoding("utf-32le"),
+ "abcdefgh".force_encoding("utf-32le")[0,3])
+ assert_str_equal(
+ "abcdefgh".force_encoding("utf-32be"),
+ "abcdefgh".force_encoding("utf-32be")[0,3])
+ end
+
+ def test_mbc_len
+ al = "abcdefghijkl".force_encoding("utf-32le").each_char.to_a
+ ab = "abcdefghijkl".force_encoding("utf-32be").each_char.to_a
+ assert_equal("abcd".force_encoding("utf-32le"), al.shift)
+ assert_equal("efgh".force_encoding("utf-32le"), al.shift)
+ assert_equal("ijkl".force_encoding("utf-32le"), al.shift)
+ assert_equal("abcd".force_encoding("utf-32be"), ab.shift)
+ assert_equal("efgh".force_encoding("utf-32be"), ab.shift)
+ assert_equal("ijkl".force_encoding("utf-32be"), ab.shift)
+ end
+
+ def ascii_to_utf16le(s)
+ s.unpack("C*").map {|x| [x,0,0,0] }.flatten.pack("C*").force_encoding("utf-32le")
+ end
+
+ def ascii_to_utf16be(s)
+ s.unpack("C*").map {|x| [0,0,0,x] }.flatten.pack("C*").force_encoding("utf-32be")
+ end
+
+ def test_mbc_newline
+ al = ascii_to_utf16le("foo\nbar\nbaz\n").lines.to_a
+ ab = ascii_to_utf16be("foo\nbar\nbaz\n").lines.to_a
+
+ assert_equal(ascii_to_utf16le("foo\n"), al.shift)
+ assert_equal(ascii_to_utf16le("bar\n"), al.shift)
+ assert_equal(ascii_to_utf16le("baz\n"), al.shift)
+ assert_equal(ascii_to_utf16be("foo\n"), ab.shift)
+ assert_equal(ascii_to_utf16be("bar\n"), ab.shift)
+ assert_equal(ascii_to_utf16be("baz\n"), ab.shift)
+
+ sl = "a\0".force_encoding("utf-32le")
+ sb = "a\0".force_encoding("utf-32be")
+ assert_equal(sl, sl.chomp)
+ assert_equal(sb, sb.chomp)
+ end
+
+ def test_mbc_to_code
+ sl = "a\0\0\0".force_encoding("utf-32le")
+ sb = "\0\0\0a".force_encoding("utf-32be")
+ assert_equal("a".ord, sl.ord)
+ assert_equal("a".ord, sb.ord)
+ end
+
+ def utf8_to_utf32(s, e)
+ s.chars.map {|c| c.ord.chr(e) }.join
+ end
+
+ def test_mbc_case_fold
+ rl = Regexp.new(utf8_to_utf32("^(\u3042)(a)\\1\\2$", "utf-32le"), "i")
+ rb = Regexp.new(utf8_to_utf32("^(\u3042)(a)\\1\\2$", "utf-32be"), "i")
+ assert_equal(Encoding.find("utf-32le"), rl.encoding)
+ assert_equal(Encoding.find("utf-32be"), rb.encoding)
+ assert_match(rl, utf8_to_utf32("\u3042a\u3042a", "utf-32le"))
+ assert_match(rb, utf8_to_utf32("\u3042a\u3042a", "utf-32be"))
+ end
+
+ def test_code_to_mbc
+ sl = "a\0\0\0".force_encoding("utf-32le")
+ sb = "\0\0\0a".force_encoding("utf-32be")
+ assert_equal(sl, "a".ord.chr("utf-32le"))
+ assert_equal(sb, "a".ord.chr("utf-32be"))
+ end
+end
+
diff --git a/test/ruby/enc/test_windows_1251.rb b/test/ruby/enc/test_windows_1251.rb
new file mode 100644
index 0000000000..6fbf3159a1
--- /dev/null
+++ b/test/ruby/enc/test_windows_1251.rb
@@ -0,0 +1,16 @@
+# encoding:windows-1251
+
+require "test/unit"
+
+class TestWindows1251 < Test::Unit::TestCase
+ def test_windows_1251
+ (0xc0..0xdf).each do |c|
+ c1 = c.chr("windows-1251")
+ c2 = (c + 0x20).chr("windows-1251")
+ assert_match(/^(#{ c1 })\1$/i, c2 + c1)
+ assert_match(/^(#{ c2 })\1$/i, c1 + c2)
+ assert_match(/^[#{ c1 }]+$/i, c2 + c1)
+ assert_match(/^[#{ c2 }]+$/i, c1 + c2)
+ end
+ end
+end