diff options
Diffstat (limited to 'test/ruby/test_transcode.rb')
| -rw-r--r-- | test/ruby/test_transcode.rb | 96 |
1 files changed, 88 insertions, 8 deletions
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 0ec9e6e7b9..24ee9b9533 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -13,8 +13,8 @@ class TestTranscode < Test::Unit::TestCase assert_raise(Encoding::UndefinedConversionError) { "\x80".encode('utf-8','ASCII-8BIT') } assert_raise(Encoding::InvalidByteSequenceError) { "\x80".encode('utf-8','US-ASCII') } assert_raise(Encoding::UndefinedConversionError) { "\xA5".encode('utf-8','iso-8859-3') } - assert_raise(RuntimeError) { 'hello'.freeze.encode!('iso-8859-1') } - assert_raise(RuntimeError) { '\u3053\u3093\u306b\u3061\u306f'.freeze.encode!('iso-8859-1') } # こんにちは + assert_raise(FrozenError) { 'hello'.freeze.encode!('iso-8859-1') } + assert_raise(FrozenError) { '\u3053\u3093\u306b\u3061\u306f'.freeze.encode!('iso-8859-1') } # こんにちは end def test_arguments @@ -126,6 +126,28 @@ class TestTranscode < Test::Unit::TestCase assert_equal("D\xFCrst".force_encoding('iso-8859-2'), "D\xFCrst".encode('iso-8859-2', 'iso-8859-1')) end + def test_encode_xml_multibyte + encodings = %w'UTF-8 UTF-16LE UTF-16BE UTF-32LE UTF-32BE' + encodings.each do |src_enc| + encodings.each do |dst_enc| + escaped = "<>".encode(src_enc).encode(dst_enc, :xml=>:text) + assert_equal("<>", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text") + + escaped = '<">'.encode(src_enc).encode(dst_enc, :xml=>:attr) + assert_equal('"<">"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr") + + escaped = "<>".encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:text) + assert_equal("<>", escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :text") + + escaped = '<">'.encode(src_enc).force_encoding("UTF-8").encode(dst_enc, src_enc, :xml=>:attr) + assert_equal('"<">"', escaped.encode('UTF-8'), "failed encoding #{src_enc} to #{dst_enc} with xml: :attr") + end + end + # regression test; U+6E7F (湿) uses the same bytes in ISO-2022-JP as "<>" + assert_equal( "<>\u6E7F", "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:text).encode("UTF-8")) + assert_equal("\"<>\u6E7F\"", "<>\u6E7F".encode("ISO-2022-JP").encode("ISO-2022-JP", :xml=>:attr).encode("UTF-8")) + end + def test_ascii_range encodings = [ 'US-ASCII', 'ASCII-8BIT', @@ -469,6 +491,25 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("\u00A0", "\xFF", 'IBM437') # non-breaking space end + def test_IBM720 + assert_raise(Encoding::UndefinedConversionError) { "\x80".encode("utf-8", 'IBM720') } + assert_raise(Encoding::UndefinedConversionError) { "\x8F".encode("utf-8", 'IBM720') } + assert_raise(Encoding::UndefinedConversionError) { "\x90".encode("utf-8", 'IBM720') } + check_both_ways("\u0627", "\x9F", 'IBM720') # ا + check_both_ways("\u0628", "\xA0", 'IBM720') # ب + check_both_ways("\u00BB", "\xAF", 'IBM720') # » + check_both_ways("\u2591", "\xB0", 'IBM720') # ░ + check_both_ways("\u2510", "\xBF", 'IBM720') # ┐ + check_both_ways("\u2514", "\xC0", 'IBM720') # └ + check_both_ways("\u2567", "\xCF", 'IBM720') # ╧ + check_both_ways("\u2568", "\xD0", 'IBM720') # ╨ + check_both_ways("\u2580", "\xDF", 'IBM720') # ▀ + check_both_ways("\u0636", "\xE0", 'IBM720') # ض + check_both_ways("\u064A", "\xEF", 'IBM720') # ي + check_both_ways("\u2261", "\xF0", 'IBM720') # ≡ + check_both_ways("\u00A0", "\xFF", 'IBM720') # non-breaking space + end + def test_IBM775 check_both_ways("\u0106", "\x80", 'IBM775') # Ć check_both_ways("\u00C5", "\x8F", 'IBM775') # Å @@ -2116,6 +2157,28 @@ class TestTranscode < Test::Unit::TestCase check_both_ways("D\u00FCrst", "\xC4\xDC\x99\xA2\xA3", 'IBM037') # Dürst end + def test_CESU_8 + check_both_ways("aijrszAIJRSZ09", "aijrszAIJRSZ09", 'CESU-8') # single bytes + + # check NULL explicitly + # this is different in CESU-8 and in Java modified UTF-8 strings + check_both_ways("\0", "\0", 'CESU-8') + + # U+0080 U+00FC U+00FF U+0100 U+0400 U+0700 U+07FF + two_byte_chars = "\xC2\x80\x20\xC3\xBC\x20\xC3\xBF\x20\xC4\x80\x20\xD0\x80\x20\xDC\x80\x20\xDF\xBF" + check_both_ways(two_byte_chars, two_byte_chars, 'CESU-8') + + # U+0800 U+2200 U+4E00 U+D7FF U+E000 U+FFFF + three_byte_chars = "\xE0\xA0\x80\x20\xE2\x88\x80\x20\xE4\xB8\x80\x20\xED\x9F\xBF\x20\xEE\x80\x80\x20\xEF\xBF\xBF" + check_both_ways(three_byte_chars, three_byte_chars, 'CESU-8') + + # characters outside BMP (double surrogates in CESU-8) + # U+10000 U+20000 U+50000 U+10FFFF + utf8 = "\xF0\x90\x80\x80 \xF0\xA0\x80\x80 \xF1\x90\x80\x80 \xF4\x8F\xBF\xBF" + cesu = "\xED\xA0\x80\xED\xB0\x80 \xED\xA1\x80\xED\xB0\x80 \xED\xA4\x80\xED\xB0\x80 \xED\xAF\xBF\xED\xBF\xBF" + check_both_ways(utf8, cesu, 'CESU-8') + end + def test_nothing_changed a = "James".force_encoding("US-ASCII") b = a.encode("Shift_JIS") @@ -2161,12 +2224,20 @@ class TestTranscode < Test::Unit::TestCase assert_equal("U+3042", "\u{3042}".encode("US-ASCII", fallback: fallback.method(:escape))) end - bug8940 = '[ruby-core:57318] [Bug #8940]' - %w[UTF-32 UTF-16].each do |enc| - define_method("test_pseudo_encoding_inspect(#{enc})") do - assert_normal_exit("'aaa'.encode('#{enc}').inspect", bug8940) - assert_equal(4, 'aaa'.encode(enc).length, "should count in #{enc} with BOM") + def test_fallback_aref + fallback = Object.new + def fallback.[](x) + "U+%.4X" % x.unpack("U") end + assert_equal("U+3042", "\u{3042}".encode("US-ASCII", fallback: fallback)) + end + + def test_pseudo_encoding_inspect + s = 'aaa'.encode "UTF-16" + assert_equal '"\xFE\xFF\x00\x61\x00\x61\x00\x61"', s.inspect + + s = 'aaa'.encode "UTF-32" + assert_equal '"\x00\x00\xFE\xFF\x00\x00\x00\x61\x00\x00\x00\x61\x00\x00\x00\x61"', s.inspect end def test_encode_with_invalid_chars @@ -2220,12 +2291,21 @@ class TestTranscode < Test::Unit::TestCase "#{bug} coderange should not have side effects") end - def test_universal_newline + def test_newline_options bug11324 = '[ruby-core:69841] [Bug #11324]' usascii = Encoding::US_ASCII s = "A\nB\r\nC".force_encoding(usascii) assert_equal("A\nB\nC", s.encode(usascii, universal_newline: true), bug11324) assert_equal("A\nB\nC", s.encode(usascii, universal_newline: true, undef: :replace), bug11324) assert_equal("A\nB\nC", s.encode(usascii, universal_newline: true, undef: :replace, replace: ''), bug11324) + assert_equal("A\nB\nC", s.encode(usascii, newline: :universal)) + assert_equal("A\nB\nC", s.encode(usascii, newline: :universal, undef: :replace)) + assert_equal("A\nB\nC", s.encode(usascii, newline: :universal, undef: :replace, replace: '')) + assert_equal("A\rB\r\rC", s.encode(usascii, cr_newline: true)) + assert_equal("A\rB\r\rC", s.encode(usascii, newline: :cr)) + assert_equal("A\r\nB\r\r\nC", s.encode(usascii, crlf_newline: true)) + assert_equal("A\r\nB\r\r\nC", s.encode(usascii, newline: :crlf)) + assert_equal("A\nB\nC", s.encode(usascii, lf_newline: true)) + assert_equal("A\nB\nC", s.encode(usascii, newline: :lf)) end end |
