diff options
Diffstat (limited to 'test/ruby/test_m17n.rb')
| -rw-r--r-- | test/ruby/test_m17n.rb | 1733 |
1 files changed, 1733 insertions, 0 deletions
diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb new file mode 100644 index 0000000000..9f7a3c7f4b --- /dev/null +++ b/test/ruby/test_m17n.rb @@ -0,0 +1,1733 @@ +# coding: US-ASCII +# frozen_string_literal: false +require 'test/unit' + +class TestM17N < Test::Unit::TestCase + def assert_encoding(encname, actual, message=nil) + assert_equal(Encoding.find(encname), actual, message) + end + + module AESU + def ua(str) str.dup.force_encoding("US-ASCII") end + def a(str) str.b end + def e(str) str.dup.force_encoding("EUC-JP") end + def s(str) str.dup.force_encoding("Windows-31J") end + def u(str) str.dup.force_encoding("UTF-8") end + end + include AESU + extend AESU + + def assert_strenc(bytes, enc, actual, message=nil) + assert_instance_of(String, actual, message) + enc = Encoding.find(enc) if String === enc + assert_equal(enc, actual.encoding, message) + assert_equal(a(bytes), a(actual), message) + end + + def assert_regexp_generic_encoding(r) + assert_not_predicate(r, :fixed_encoding?) + %w[ASCII-8BIT EUC-JP Windows-31J UTF-8].each {|ename| + # "\xc2\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Windows-31J and UTF-8. + assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(ename) } + } + end + + def assert_regexp_fixed_encoding(r) + assert_predicate(r, :fixed_encoding?) + %w[ASCII-8BIT EUC-JP Windows-31J UTF-8].each {|ename| + enc = Encoding.find(ename) + if enc == r.encoding + assert_nothing_raised { r =~ "\xc2\xa1".force_encoding(enc) } + else + assert_raise(Encoding::CompatibilityError) { r =~ "\xc2\xa1".force_encoding(enc) } + end + } + end + + def assert_regexp_generic_ascii(r) + assert_encoding("US-ASCII", r.encoding) + assert_regexp_generic_encoding(r) + end + + def assert_regexp_fixed_ascii8bit(r) + assert_encoding("ASCII-8BIT", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_eucjp(r) + assert_encoding("EUC-JP", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_sjis(r) + assert_encoding("Windows-31J", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def assert_regexp_fixed_utf8(r) + assert_encoding("UTF-8", r.encoding) + assert_regexp_fixed_encoding(r) + end + + def encdump(str) + d = str.dump + if /\.force_encoding\("[A-Za-z0-9.:_+-]*"\)\z/ =~ d + d + else + "#{d}.force_encoding(#{str.encoding.name.dump})" + end + end + + def encdumpargs(args) + r = '(' + args.each_with_index {|a, i| + r << ',' if 0 < i + if String === a + r << encdump(a) + else + r << a.inspect + end + } + r << ')' + r + end + + def assert_str_enc_propagation(t, s1, s2) + if !s1.ascii_only? + assert_equal(s1.encoding, t.encoding) + elsif !s2.ascii_only? + assert_equal(s2.encoding, t.encoding) + else + assert_include([s1.encoding, s2.encoding], t.encoding) + end + end + + def assert_same_result(expected_proc, actual_proc) + e = nil + begin + t = expected_proc.call + rescue + e = $! + end + if e + assert_raise(e.class) { actual_proc.call } + else + assert_equal(t, actual_proc.call) + end + end + + def str_enc_compatible?(*strs) + encs = [] + strs.each {|s| + encs << s.encoding if !s.ascii_only? + } + encs.uniq! + encs.length <= 1 + end + + # tests start + + def test_string_ascii_literal + assert_encoding("ASCII-8BIT", eval(a(%{""})).encoding) + assert_encoding("ASCII-8BIT", eval(a(%{"a"})).encoding) + end + + def test_string_eucjp_literal + assert_encoding("EUC-JP", eval(e(%{""})).encoding) + assert_encoding("EUC-JP", eval(e(%{"a"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\xa1\xa1"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\xa1\\xa1"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\x20"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\n"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding) + end + + def test_utf8_literal + assert_equal(Encoding::UTF_8, "\u3042".encoding, "[ruby-dev:33406] \"\\u3042\".encoding") + assert_raise(SyntaxError) { eval(a('\u3052\x80')) } + end + + def test_string_mixed_unicode + assert_raise(SyntaxError) { eval(a(%{"\xc2\xa1\\u{6666}"})) } + assert_raise(SyntaxError) { eval(e(%{"\xc2\xa1\\u{6666}"})) } + assert_raise(SyntaxError) { eval(s(%{"\xc2\xa1\\u{6666}"})) } + assert_nothing_raised { eval(u(%{"\xc2\xa1\\u{6666}"})) } + assert_raise(SyntaxError) { eval(a(%{"\\u{6666}\xc2\xa1"})) } + assert_raise(SyntaxError) { eval(e(%{"\\u{6666}\xc2\xa1"})) } + assert_raise(SyntaxError) { eval(s(%{"\\u{6666}\xc2\xa1"})) } + assert_nothing_raised { eval(u(%{"\\u{6666}\xc2\xa1"})) } + end + + def test_string_inspect_invalid + assert_equal('"\xFE"', e("\xfe").inspect) + assert_equal('"\x8E"', e("\x8e").inspect) + assert_equal('"\x8F"', e("\x8f").inspect) + assert_equal('"\x8F\xA1"', e("\x8f\xa1").inspect) + assert_equal('"\xEF"', s("\xef").inspect) + assert_equal('"\xC2"', u("\xc2").inspect) + assert_equal('"\xE0\x80"', u("\xe0\x80").inspect) + assert_equal('"\xF0\x80\x80"', u("\xf0\x80\x80").inspect) + assert_equal('"\xF8\x80\x80\x80"', u("\xf8\x80\x80\x80").inspect) + assert_equal('"\xFC\x80\x80\x80\x80"', u("\xfc\x80\x80\x80\x80").inspect) + + assert_equal('"\xFE "', e("\xfe ").inspect) + assert_equal('"\x8E "', e("\x8e ").inspect) + assert_equal('"\x8F "', e("\x8f ").inspect) + assert_equal('"\x8F\xA1 "', e("\x8f\xa1 ").inspect) + assert_equal('"\xEF "', s("\xef ").inspect) + assert_equal('"\xC2 "', u("\xc2 ").inspect) + assert_equal('"\xE0\x80 "', u("\xe0\x80 ").inspect) + assert_equal('"\xF0\x80\x80 "', u("\xf0\x80\x80 ").inspect) + assert_equal('"\xF8\x80\x80\x80 "', u("\xf8\x80\x80\x80 ").inspect) + assert_equal('"\xFC\x80\x80\x80\x80 "', u("\xfc\x80\x80\x80\x80 ").inspect) + + assert_equal('"\x81."', s("\x81.").inspect) + assert_equal('"\xFC"', u("\xfc").inspect) + end + + def test_string_inspect_encoding + [ + Encoding::UTF_8, + Encoding::EUC_JP, + Encoding::Windows_31J, + Encoding::GB18030, + ].each do |e| + EnvUtil.with_default_external(e) do + str = "\x81\x30\x81\x30".force_encoding('GB18030') + assert_equal(Encoding::GB18030 == e ? %{"#{str}"} : '"\x{81308130}"', str.inspect) + str = e("\xa1\x8f\xa1\xa1") + expected = "\"\\xA1\x8F\xA1\xA1\"".force_encoding("EUC-JP") + assert_equal(Encoding::EUC_JP == e ? expected : "\"\\xA1\\x{8FA1A1}\"", str.inspect) + str = s("\x81@") + assert_equal(Encoding::Windows_31J == e ? %{"#{str}"} : '"\x{8140}"', str.inspect) + str = "\u3042\u{10FFFD}" + assert_equal(Encoding::UTF_8 == e ? %{"#{str}"} : '"\u3042\u{10FFFD}"', str.inspect) + end + end + + EnvUtil.with_default_external(Encoding::UTF_8) do + [ + Encoding::UTF_16BE, + Encoding::UTF_16LE, + Encoding::UTF_32BE, + Encoding::UTF_32LE, + Encoding::UTF8_SOFTBANK + ].each do |e| + str = "abc".encode(e) + assert_equal('"abc"', str.inspect) + end + end + end + + def test_utf_dummy_are_like_regular_dummy_encodings + [Encoding::UTF_16, Encoding::UTF_32].each do |enc| + s = "\u3042".encode("UTF-32BE") + assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + s = "\x00\x00\xFE\xFF" + assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + + assert_equal [0, 0, 254, 255], "\x00\x00\xFE\xFF".force_encoding(enc).codepoints + assert_equal 0, "\x00\x00\xFE\xFF".force_encoding(enc).ord + assert_equal 255, "\xFF\xFE\x00\x00".force_encoding(enc).ord + end + end + + def test_utf_without_bom_asciionly + bug10598 = '[ruby-core:66835] [Bug #10598]' + encs = [Encoding::UTF_16, Encoding::UTF_32].find_all {|enc| + "abcd".force_encoding(enc).ascii_only? + } + assert_empty(encs, bug10598) + end + + def test_utf_without_bom_valid + encs = [Encoding::UTF_16, Encoding::UTF_32].find_all {|enc| + !(+"abcd").encode!(enc).force_encoding(enc).valid_encoding? + } + assert_empty(encs) + end + + def test_object_utf16_32_inspect + EnvUtil.with_default_external(Encoding::UTF_8) do + o = Object.new + [Encoding::UTF_16BE, Encoding::UTF_16LE, Encoding::UTF_32BE, Encoding::UTF_32LE].each do |e| + o.instance_eval "undef inspect;def inspect;'abc'.encode('#{e}');end" + assert_equal '[abc]', [o].inspect + end + end + end + + def test_object_inspect_external + omit "https://bugs.ruby-lang.org/issues/18338" + + o = Object.new + + EnvUtil.with_default_external(Encoding::UTF_16BE) do + def o.inspect + "abc" + end + assert_nothing_raised(Encoding::CompatibilityError) { [o].inspect } + + def o.inspect + "abc".encode(Encoding.default_external) + end + assert_equal '[abc]', [o].inspect + end + + EnvUtil.with_default_external(Encoding::US_ASCII) do + def o.inspect + "\u3042" + end + assert_equal '[\u3042]', [o].inspect + + def o.inspect + "\x82\xa0".force_encoding(Encoding::Windows_31J) + end + assert_equal '[\x{82A0}]', [o].inspect + end + end + + def test_str_dump + [ + e("\xfe"), + e("\x8e"), + e("\x8f"), + e("\x8f\xa1"), + s("\xef"), + u("\xc2"), + u("\xe0\x80"), + u("\xf0\x80\x80"), + u("\xf8\x80\x80\x80"), + u("\xfc\x80\x80\x80\x80"), + + e("\xfe "), + e("\x8e "), + e("\x8f "), + e("\x8f\xa1 "), + s("\xef "), + u("\xc2 "), + u("\xe0\x80 "), + u("\xf0\x80\x80 "), + u("\xf8\x80\x80\x80 "), + u("\xfc\x80\x80\x80\x80 "), + + + e("\xa1\x8f\xa1\xa1"), + + s("\x81."), + s("\x81@"), + + u("\xfc"), + "\u3042", + "ascii", + + "\u3042".encode("UTF-16LE"), + "\u3042".encode("UTF-16BE"), + ].each do |str| + dump = str.dump + assert_equal(str, eval(dump), "[ruby-dev:33142]") + assert_equal(str, dump.undump) + assert_equal(str, eval("# frozen-string-literal: true\n#{dump}"), '[Bug #14687]') + end + end + + def test_validate_redundant_utf8 + bits_0x10ffff = "11110100 10001111 10111111 10111111" + [ + "0xxxxxxx", + "110XXXXx 10xxxxxx", + "1110XXXX 10Xxxxxx 10xxxxxx", + "11110XXX 10XXxxxx 10xxxxxx 10xxxxxx", + "111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx", + "1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx", + "11111110 10XXXXXx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx", + "11111111 10XXXXXX 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx", + ].each {|pat0| + [ + pat0.gsub(/x/, '1'), + pat0.gsub(/x/, '0') + ].each {|pat1| + [ + pat1.sub(/X([^X]*)\z/, '1\1').gsub(/X/, "0"), + pat1.gsub(/X/, "1"), + ].each {|pat2| + s = [pat2.gsub(/ /, "")].pack("B*").force_encoding("utf-8") + if pat2 <= bits_0x10ffff + assert_predicate(s, :valid_encoding?, "#{pat2}") + else + assert_not_predicate(s, :valid_encoding?, "#{pat2}") + end + } + if / / =~ pat0 + pat3 = pat1.gsub(/X/, "0") + s = [pat3.gsub(/ /, "")].pack("B*").force_encoding("utf-8") + assert_not_predicate(s, :valid_encoding?, "#{pat3}") + end + } + } + end + + def test_validate_surrogate + # 1110XXXX 10Xxxxxx 10xxxxxx : 3 bytes UTF-8 + pats = [ + "11101101 10011111 10111111", # just before surrogate high + "11101101 1010xxxx 10xxxxxx", # surrogate high + "11101101 1011xxxx 10xxxxxx", # surrogate low + "11101110 10000000 10000000", # just after surrogate low + ] + pats.values_at(1,2).each {|pat0| + [ + pat0.gsub(/x/, '0'), + pat0.gsub(/x/, '1'), + ].each {|pat1| + s = [pat1.gsub(/ /, "")].pack("B*").force_encoding("utf-8") + assert_not_predicate(s, :valid_encoding?, "#{pat1}") + } + } + pats.values_at(0,3).each {|pat| + s = [pat.gsub(/ /, "")].pack("B*").force_encoding("utf-8") + assert_predicate(s, :valid_encoding?, "#{pat}") + } + end + + def test_regexp_too_short_multibyte_character + assert_raise(SyntaxError) { eval('/\xfe/e') } + assert_raise(SyntaxError) { eval('/\x8e/e') } + assert_raise(SyntaxError) { eval('/\x8f/e') } + assert_raise(SyntaxError) { eval('/\x8f\xa1/e') } + assert_raise(SyntaxError) { eval('/\xef/s') } + assert_raise(SyntaxError) { eval('/\xc2/u') } + assert_raise(SyntaxError) { eval('/\xe0\x80/u') } + assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } + + # raw 8bit + assert_raise(SyntaxError) { eval("/\xfe/e") } + assert_raise(SyntaxError) { eval("/\xc2/u") } + + # invalid suffix + assert_raise(SyntaxError) { eval('/\xc2\xff/u') } + assert_raise(SyntaxError) { eval('/\xc2 /u') } + assert_raise(SyntaxError) { eval('/\xc2\x20/u') } + end + + def test_regexp_generic + assert_regexp_generic_ascii(/a/) + assert_regexp_generic_ascii(Regexp.new(a("a"))) + assert_regexp_generic_ascii(Regexp.new(e("a"))) + assert_regexp_generic_ascii(Regexp.new(s("a"))) + assert_regexp_generic_ascii(Regexp.new(u("a"))) + + [/a/, Regexp.new(a("a"))].each {|r| + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_equal(nil, r =~ a("\xc2\xa1")) + assert_equal(nil, r =~ e("\xc2\xa1")) + assert_equal(nil, r =~ s("\xc2\xa1")) + assert_equal(nil, r =~ u("\xc2\xa1")) + } + end + + def test_regexp_ascii_none + r = /a/n + + assert_warning(%r{historical binary regexp match /\.\.\./n against}) { + assert_regexp_generic_ascii(r) + } + + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_equal(nil, r =~ a("\xc2\xa1")) + assert_warning(%r{historical binary regexp match /\.\.\./n against EUC-JP string}) { + assert_equal(nil, r =~ e("\xc2\xa1")) + } + assert_warning(%r{historical binary regexp match /\.\.\./n against Windows-31J string}) { + assert_equal(nil, r =~ s("\xc2\xa1")) + } + assert_warning(%r{historical binary regexp match /\.\.\./n against UTF-8 string}) { + assert_equal(nil, r =~ u("\xc2\xa1")) + } + + assert_nothing_raised { eval(e("/\\x80/n")) } + end + + def test_regexp_ascii + assert_regexp_fixed_ascii8bit(/\xc2\xa1/n) + assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/}))) + assert_regexp_fixed_ascii8bit(eval(a(%{/\xc2\xa1/n}))) + assert_regexp_fixed_ascii8bit(eval(a(%q{/\xc2\xa1/}))) + + s = '\xc2\xa1' + assert_regexp_fixed_ascii8bit(/#{s}/) + + assert_raise(SyntaxError) { eval("/\xa1\xa1/n".force_encoding("euc-jp")) } + + [/\xc2\xa1/n, eval(a(%{/\xc2\xa1/})), eval(a(%{/\xc2\xa1/n}))].each {|r| + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc2\xa1")) + assert_raise(Encoding::CompatibilityError) { r =~ e("\xc2\xa1") } + assert_raise(Encoding::CompatibilityError) { r =~ s("\xc2\xa1") } + assert_raise(Encoding::CompatibilityError) { r =~ u("\xc2\xa1") } + } + end + + def test_regexp_euc + assert_regexp_fixed_eucjp(/a/e) + assert_regexp_fixed_eucjp(/\xc2\xa1/e) + assert_regexp_fixed_eucjp(eval(e(%{/\xc2\xa1/}))) + assert_regexp_fixed_eucjp(eval(e(%q{/\xc2\xa1/}))) + + [/a/e].each {|r| + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_raise(Encoding::CompatibilityError) { r =~ a("\xc2\xa1") } + assert_equal(nil, r =~ e("\xc2\xa1")) + assert_raise(Encoding::CompatibilityError) { r =~ s("\xc2\xa1") } + assert_raise(Encoding::CompatibilityError) { r =~ u("\xc2\xa1") } + } + + [/\xc2\xa1/e, eval(e(%{/\xc2\xa1/})), eval(e(%q{/\xc2\xa1/}))].each {|r| + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_raise(Encoding::CompatibilityError) { r =~ a("\xc2\xa1") } + assert_equal(0, r =~ e("\xc2\xa1")) + assert_raise(Encoding::CompatibilityError) { r =~ s("\xc2\xa1") } + assert_raise(Encoding::CompatibilityError) { r =~ u("\xc2\xa1") } + } + end + + def test_regexp_sjis + assert_regexp_fixed_sjis(/a/s) + assert_regexp_fixed_sjis(/\xc2\xa1/s) + assert_regexp_fixed_sjis(eval(s(%{/\xc2\xa1/}))) + assert_regexp_fixed_sjis(eval(s(%q{/\xc2\xa1/}))) + end + + def test_regexp_windows_31j + begin + Regexp.new("\xa1".force_encoding("windows-31j")) =~ "\xa1\xa1".force_encoding("euc-jp") + rescue Encoding::CompatibilityError + err = $! + end + assert_match(/windows-31j/i, err.message) + end + + def test_regexp_embed + r = eval(e("/\xc2\xa1/")) + assert_raise(RegexpError) { eval(s("/\xc2\xa1\#{r}/s")) } + assert_raise(RegexpError) { eval(s("/\#{r}\xc2\xa1/s")) } + + r = /\xc2\xa1/e + assert_raise(RegexpError) { eval(s("/\xc2\xa1\#{r}/s")) } + assert_raise(RegexpError) { eval(s("/\#{r}\xc2\xa1/s")) } + + r = eval(e("/\xc2\xa1/")) + assert_raise(RegexpError) { /\xc2\xa1#{r}/s } + + r = /\xc2\xa1/e + assert_raise(RegexpError) { /\xc2\xa1#{r}/s } + + r1 = Regexp.new('foo'.force_encoding("ascii-8bit")) + r2 = eval('/bar#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::US_ASCII, r2.encoding) + + r1 = Regexp.new('foo'.force_encoding("us-ascii")) + r2 = eval('/bar#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::US_ASCII, r2.encoding) + + r1 = Regexp.new('foo'.force_encoding("ascii-8bit")) + r2 = eval('/bar#{r1}/'.force_encoding('us-ascii')) + assert_equal(Encoding::US_ASCII, r2.encoding) + + r1 = Regexp.new('foo'.force_encoding("us-ascii")) + r2 = eval('/bar#{r1}/'.force_encoding('us-ascii')) + assert_equal(Encoding::US_ASCII, r2.encoding) + + r1 = Regexp.new('\xa1'.force_encoding("ascii-8bit")) + r2 = eval('/bar#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::ASCII_8BIT, r2.encoding) + + r1 = Regexp.new('\xa1'.force_encoding("ascii-8bit")) + r2 = eval('/bar#{r1}/'.force_encoding('us-ascii')) + assert_equal(Encoding::ASCII_8BIT, r2.encoding) + + r1 = Regexp.new('foo'.force_encoding("ascii-8bit")) + r2 = eval('/\xa1#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::ASCII_8BIT, r2.encoding) + + r1 = Regexp.new('foo'.force_encoding("us-ascii")) + r2 = eval('/\xa1#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::ASCII_8BIT, r2.encoding) + + r1 = Regexp.new('\xa1'.force_encoding("ascii-8bit")) + r2 = eval('/\xa1#{r1}/'.force_encoding('ascii-8bit')) + assert_equal(Encoding::ASCII_8BIT, r2.encoding) + + [r1, r2] + end + + def test_regexp_named_class + assert_match(/[[:space:]]/u, "\u{00a0}") + assert_match(/[[:space:]]/, "\u{00a0}") + end + + def test_regexp_property + s = '\p{Hiragana}'.force_encoding("euc-jp") + assert_equal(Encoding::EUC_JP, s.encoding) + r = nil + assert_nothing_raised { + r = Regexp.new(s) + } + assert_predicate(r, :fixed_encoding?) + assert_match(r, "\xa4\xa2".force_encoding("euc-jp")) + + r = eval('/\p{Hiragana}/'.force_encoding("euc-jp")) + assert_predicate(r, :fixed_encoding?) + assert_match(r, "\xa4\xa2".force_encoding("euc-jp")) + + r = /\p{Hiragana}/e + assert_predicate(r, :fixed_encoding?) + assert_match(r, "\xa4\xa2".force_encoding("euc-jp")) + + r = /\p{AsciI}/e + assert_predicate(r, :fixed_encoding?) + assert_match(r, "a".force_encoding("euc-jp")) + + r = /\p{hiraganA}/e + assert_predicate(r, :fixed_encoding?) + assert_match(r, "\xa4\xa2".force_encoding("euc-jp")) + + r = eval('/\u{3042}\p{Hiragana}/'.force_encoding("euc-jp")) + assert_predicate(r, :fixed_encoding?) + assert_equal(Encoding::UTF_8, r.encoding) + + r = eval('/\p{Hiragana}\u{3042}/'.force_encoding("euc-jp")) + assert_predicate(r, :fixed_encoding?) + assert_equal(Encoding::UTF_8, r.encoding) + end + + def test_regexp_embed_preprocess + r1 = /\xa4\xa2/e + r2 = /#{r1}/ + assert_include(r2.source, r1.source) + end + + def test_begin_end_offset + str = e("\244\242\244\244\244\246\244\250\244\252a") + assert(/(a)/ =~ str) + assert_equal("a", $&) + assert_equal(5, $~.begin(0)) + assert_equal(6, $~.end(0)) + assert_equal([5,6], $~.offset(0)) + assert_equal(5, $~.begin(1)) + assert_equal(6, $~.end(1)) + assert_equal([5,6], $~.offset(1)) + end + + def test_begin_end_offset_sjis + str = s("\x81@@") + assert(/@/ =~ str) + assert_equal(s("\x81@"), $`) + assert_equal("@", $&) + assert_equal("", $') + assert_equal([1,2], $~.offset(0)) + end + + def test_quote + assert_regexp_generic_ascii(/#{Regexp.quote(a("a"))}#{Regexp.quote(e("e"))}/) + + assert_encoding("US-ASCII", Regexp.quote(a("")).encoding) + assert_encoding("US-ASCII", Regexp.quote(e("")).encoding) + assert_encoding("US-ASCII", Regexp.quote(s("")).encoding) + assert_encoding("US-ASCII", Regexp.quote(u("")).encoding) + assert_encoding("US-ASCII", Regexp.quote(a("a")).encoding) + assert_encoding("US-ASCII", Regexp.quote(e("a")).encoding) + assert_encoding("US-ASCII", Regexp.quote(s("a")).encoding) + assert_encoding("US-ASCII", Regexp.quote(u("a")).encoding) + + assert_encoding("ASCII-8BIT", Regexp.quote(a("\xc2\xa1")).encoding) + assert_encoding("EUC-JP", Regexp.quote(e("\xc2\xa1")).encoding) + assert_encoding("Windows-31J", Regexp.quote(s("\xc2\xa1")).encoding) + assert_encoding("UTF-8", Regexp.quote(u("\xc2\xa1")).encoding) + end + + def test_union_0 + r = Regexp.union + assert_regexp_generic_ascii(r) + assert_not_match(r, a("")) + assert_not_match(r, e("")) + assert_not_match(r, s("")) + assert_not_match(r, u("")) + end + + def test_union_1_asciionly_string + assert_regexp_generic_ascii(Regexp.union(a(""))) + assert_regexp_generic_ascii(Regexp.union(e(""))) + assert_regexp_generic_ascii(Regexp.union(s(""))) + assert_regexp_generic_ascii(Regexp.union(u(""))) + assert_regexp_generic_ascii(Regexp.union(a("a"))) + assert_regexp_generic_ascii(Regexp.union(e("a"))) + assert_regexp_generic_ascii(Regexp.union(s("a"))) + assert_regexp_generic_ascii(Regexp.union(u("a"))) + assert_regexp_generic_ascii(Regexp.union(a("\t"))) + assert_regexp_generic_ascii(Regexp.union(e("\t"))) + assert_regexp_generic_ascii(Regexp.union(s("\t"))) + assert_regexp_generic_ascii(Regexp.union(u("\t"))) + end + + def test_union_1_nonascii_string + assert_regexp_fixed_ascii8bit(Regexp.union(a("\xc2\xa1"))) + assert_regexp_fixed_eucjp(Regexp.union(e("\xc2\xa1"))) + assert_regexp_fixed_sjis(Regexp.union(s("\xc2\xa1"))) + assert_regexp_fixed_utf8(Regexp.union(u("\xc2\xa1"))) + end + + def test_union_1_regexp + assert_regexp_generic_ascii(Regexp.union(//)) + assert_warning(%r{historical binary regexp match /.../n against}) { + assert_regexp_generic_ascii(Regexp.union(//n)) + } + assert_regexp_fixed_eucjp(Regexp.union(//e)) + assert_regexp_fixed_sjis(Regexp.union(//s)) + assert_regexp_fixed_utf8(Regexp.union(//u)) + end + + def test_union_2 + ary = [ + a(""), e(""), s(""), u(""), + a("\xc2\xa1"), e("\xc2\xa1"), s("\xc2\xa1"), u("\xc2\xa1") + ] + ary.each {|s1| + ary.each {|s2| + if s1.empty? + if s2.empty? + assert_regexp_generic_ascii(Regexp.union(s1, s2)) + else + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s2.encoding, r.encoding) + end + else + if s2.empty? + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + if s1.encoding == s2.encoding + r = Regexp.union(s1, s2) + assert_regexp_fixed_encoding(r) + assert_equal(s1.encoding, r.encoding) + else + assert_raise(ArgumentError) { Regexp.union(s1, s2) } + end + end + end + } + } + end + + def test_dynamic_ascii_regexp + assert_warning(%r{historical binary regexp match /.../n against}) { + assert_regexp_generic_ascii(/#{ }/n) + } + assert_regexp_fixed_ascii8bit(/#{ }\xc2\xa1/n) + assert_regexp_fixed_ascii8bit(/\xc2\xa1#{ }/n) + assert_nothing_raised { s1, s2 = a('\xc2'), a('\xa1'); /#{s1}#{s2}/ } + end + + def test_dynamic_eucjp_regexp + assert_regexp_fixed_eucjp(/#{ }/e) + assert_regexp_fixed_eucjp(/#{ }\xc2\xa1/e) + assert_regexp_fixed_eucjp(/\xc2\xa1#{ }/e) + assert_raise(SyntaxError) { eval('/\xc2#{ }/e') } + assert_raise(SyntaxError) { eval('/#{ }\xc2/e') } + assert_raise(SyntaxError) { eval('/\xc2#{ }\xa1/e') } + assert_raise(ArgumentError) { s1, s2 = e('\xc2'), e('\xa1'); /#{s1}#{s2}/ } + end + + def test_dynamic_sjis_regexp + assert_regexp_fixed_sjis(/#{ }/s) + assert_regexp_fixed_sjis(/#{ }\xc2\xa1/s) + assert_regexp_fixed_sjis(/\xc2\xa1#{ }/s) + assert_raise(SyntaxError) { eval('/\x81#{ }/s') } + assert_raise(SyntaxError) { eval('/#{ }\x81/s') } + assert_raise(SyntaxError) { eval('/\x81#{ }\xa1/s') } + assert_raise(ArgumentError) { s1, s2 = s('\x81'), s('\xa1'); /#{s1}#{s2}/ } + end + + def test_dynamic_utf8_regexp + assert_regexp_fixed_utf8(/#{ }/u) + assert_regexp_fixed_utf8(/#{ }\xc2\xa1/u) + assert_regexp_fixed_utf8(/\xc2\xa1#{ }/u) + assert_raise(SyntaxError) { eval('/\xc2#{ }/u') } + assert_raise(SyntaxError) { eval('/#{ }\xc2/u') } + assert_raise(SyntaxError) { eval('/\xc2#{ }\xa1/u') } + assert_raise(ArgumentError) { s1, s2 = u('\xc2'), u('\xa1'); /#{s1}#{s2}/ } + end + + def test_regexp_unicode + assert_nothing_raised { eval '/\u{0}/u' } + assert_nothing_raised { eval '/\u{D7FF}/u' } + assert_raise(SyntaxError) { eval '/\u{D800}/u' } + assert_raise(SyntaxError) { eval '/\u{DFFF}/u' } + assert_nothing_raised { eval '/\u{E000}/u' } + assert_nothing_raised { eval '/\u{10FFFF}/u' } + assert_raise(SyntaxError) { eval '/\u{110000}/u' } + end + + def test_regexp_mixed_unicode + assert_raise(SyntaxError) { eval(a(%{/\xc2\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\xc2\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\xc2\xa1\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\xc2\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\xc2\xa1/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\xc2\xa1/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\xc2\xa1/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}\xc2\xa1/})) } + + assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa1\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\\xc2\\xa1\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}\\xc2\\xa1/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}\\xc2\\xa1/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}\\xc2\\xa1/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}\\xc2\\xa1/})) } + + assert_raise(SyntaxError) { eval(a(%{/\xc2\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\xc2\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\xc2\xa1#{ }\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\xc2\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{ }\xc2\xa1/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{ }\xc2\xa1/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{ }\xc2\xa1/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}#{ }\xc2\xa1/})) } + + assert_raise(SyntaxError) { eval(a(%{/\\xc2\\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(e(%{/\\xc2\\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(s(%{/\\xc2\\xa1#{ }\\u{6666}/})) } + assert_nothing_raised { eval(u(%{/\\xc2\\xa1#{ }\\u{6666}/})) } + assert_raise(SyntaxError) { eval(a(%{/\\u{6666}#{ }\\xc2\\xa1/})) } + assert_raise(SyntaxError) { eval(e(%{/\\u{6666}#{ }\\xc2\\xa1/})) } + assert_raise(SyntaxError) { eval(s(%{/\\u{6666}#{ }\\xc2\\xa1/})) } + assert_nothing_raised { eval(u(%{/\\u{6666}#{ }\\xc2\\xa1/})) } + end + + def test_str_allocate + s = String.allocate + assert_equal(Encoding::ASCII_8BIT, s.encoding) + end + + def test_str_String + s = String(10) + assert_equal(Encoding::US_ASCII, s.encoding) + end + + def test_sprintf_c + assert_strenc("\x80", 'ASCII-8BIT', a("%c") % 128) + #assert_raise(ArgumentError) { a("%c") % 0xc2a1 } + assert_strenc("\xc2\xa1", 'EUC-JP', e("%c") % 0xc2a1) + assert_raise(ArgumentError) { e("%c") % 0xc2 } + assert_strenc("\xc2", 'Windows-31J', s("%c") % 0xc2) + #assert_raise(ArgumentError) { s("%c") % 0xc2a1 } + assert_strenc("\u{c2a1}", 'UTF-8', u("%c") % 0xc2a1) + assert_strenc("\u{c2}", 'UTF-8', u("%c") % 0xc2) + assert_raise(Encoding::CompatibilityError) { + "%s%s" % [s("\xc2\xa1"), e("\xc2\xa1")] + } + + assert_equal("\u3042".encode('Windows-31J'), "%c" % "\u3042\u3044".encode('Windows-31J')) + end + + def test_sprintf_p + Encoding.list.each do |e| + unless e.ascii_compatible? + format = e.dummy? ? "%p".force_encoding(e) : "%p".encode(e) + assert_raise(Encoding::CompatibilityError) do + sprintf(format, nil) + end + assert_raise(Encoding::CompatibilityError) do + format % nil + end + next + end + format = "%p".force_encoding(e) + ['', 'a', "\xC2\xA1", "\x00"].each do |s| + s.force_encoding(e) + enc = (''.force_encoding(e) + s.inspect).encoding + assert_strenc(s.inspect, enc, format % s) + end + s = "\xC2\xA1".force_encoding(e) + enc = ('' + s.inspect).encoding + assert_strenc('%10s' % s.inspect, enc, "%10p" % s) + end + end + + def test_sprintf_s + assert_strenc('', 'ASCII-8BIT', a("%s") % a("")) + assert_strenc('', 'EUC-JP', e("%s") % e("")) + assert_strenc('', 'Windows-31J', s("%s") % s("")) + assert_strenc('', 'UTF-8', u("%s") % u("")) + + assert_strenc('a', 'ASCII-8BIT', a("%s") % a("a")) + assert_strenc('a', 'EUC-JP', e("%s") % e("a")) + assert_strenc('a', 'Windows-31J', s("%s") % s("a")) + assert_strenc('a', 'UTF-8', u("%s") % u("a")) + + assert_strenc("\xC2\xA1", 'ASCII-8BIT', a("%s") % a("\xc2\xa1")) + assert_strenc("\xC2\xA1", 'EUC-JP', e("%s") % e("\xc2\xa1")) + #assert_strenc("\xC2\xA1", 'Windows-31J', s("%s") % s("\xc2\xa1")) + assert_strenc("\xC2\xA1", 'UTF-8', u("%s") % u("\xc2\xa1")) + + assert_strenc(" \xC2\xA1", 'ASCII-8BIT', "%10s" % a("\xc2\xa1")) + assert_strenc(" \xA1\xA1", 'EUC-JP', "%10s" % e("\xa1\xa1")) + #assert_strenc(" \xC2\xA1", 'Windows-31J', "%10s" % s("\xc2\xa1")) + assert_strenc(" \xC2\xA1", 'UTF-8', "%10s" % u("\xc2\xa1")) + + assert_strenc("\x00", 'ASCII-8BIT', a("%s") % a("\x00")) + assert_strenc("\x00", 'EUC-JP', e("%s") % e("\x00")) + assert_strenc("\x00", 'Windows-31J', s("%s") % s("\x00")) + assert_strenc("\x00", 'UTF-8', u("%s") % u("\x00")) + assert_equal("EUC-JP", (e("\xc2\xa1 %s") % "foo").encoding.name) + end + + def test_str_lt + assert_operator(a("a"), :<, a("\xa1")) + assert_operator(a("a"), :<, s("\xa1")) + assert_operator(s("a"), :<, a("\xa1")) + end + + def test_str_multiply + str = "\u3042" + assert_equal(true, (str * 0).ascii_only?, "[ruby-dev:33895]") + assert_equal(false, (str * 1).ascii_only?) + assert_equal(false, (str * 2).ascii_only?) + end + + def test_str_aref + assert_equal(a("\xc2"), a("\xc2\xa1")[0]) + assert_equal(a("\xa1"), a("\xc2\xa1")[1]) + assert_equal(nil, a("\xc2\xa1")[2]) + assert_equal(e("\xc2\xa1"), e("\xc2\xa1")[0]) + assert_equal(nil, e("\xc2\xa1")[1]) + assert_equal(s("\xc2"), s("\xc2\xa1")[0]) + assert_equal(s("\xa1"), s("\xc2\xa1")[1]) + assert_equal(nil, s("\xc2\xa1")[2]) + assert_equal(u("\xc2\xa1"), u("\xc2\xa1")[0]) + assert_equal(nil, u("\xc2\xa1")[1]) + + str = "\u3042" + assert_equal(true, str[0, 0].ascii_only?, "[ruby-dev:33895]") + assert_equal(false, str[0, 1].ascii_only?) + assert_equal(false, str[0..-1].ascii_only?) + end + + def test_utf8str_aref + s = "abcdefghijklmnopqrstuvwxyz\u{3042 3044 3046 3048 304A}" + assert_equal("a", s[0]) + assert_equal("h", s[7]) + assert_equal("i", s[8]) + assert_equal("j", s[9]) + assert_equal("\u{3044}", s[27]) + assert_equal("\u{3046}", s[28]) + assert_equal("\u{3048}", s[29]) + s = "abcdefghijklmnopqrstuvw\u{3042 3044 3046 3048 304A}" + assert_equal("\u{3044}", s[24]) + end + + def test_str_aref_len + assert_equal(a("\xa1"), a("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 1]) + assert_equal(a("\xa1\xc2"), a("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 2]) + + assert_equal(e("\xc2\xa2"), e("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 1]) + assert_equal(e("\xc2\xa2\xc2\xa3"), e("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 2]) + + assert_equal(s("\xa1"), s("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 1]) + assert_equal(s("\xa1\xc2"), s("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 2]) + + assert_equal(u("\xc2\xa2"), u("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 1]) + assert_equal(u("\xc2\xa2\xc2\xa3"), u("\xc2\xa1\xc2\xa2\xc2\xa3")[1, 2]) + end + + def test_str_aref_substr + assert_equal(a("\xa1\xc2"), a("\xc2\xa1\xc2\xa2\xc2\xa3")[a("\xa1\xc2")]) + assert_raise(Encoding::CompatibilityError) { a("\xc2\xa1\xc2\xa2\xc2\xa3")[e("\xa1\xc2")] } + + assert_equal(nil, e("\xc2\xa1\xc2\xa2\xc2\xa3")[e("\xa1\xc2")]) + assert_raise(Encoding::CompatibilityError) { e("\xc2\xa1\xc2\xa2\xc2\xa3")[s("\xa1\xc2")] } + + assert_equal(s("\xa1\xc2"), s("\xc2\xa1\xc2\xa2\xc2\xa3")[s("\xa1\xc2")]) + assert_raise(Encoding::CompatibilityError) { s("\xc2\xa1\xc2\xa2\xc2\xa3")[u("\xa1\xc2")] } + + assert_equal(nil, u("\xc2\xa1\xc2\xa2\xc2\xa3")[u("\xa1\xc2")]) + assert_raise(Encoding::CompatibilityError) { u("\xc2\xa1\xc2\xa2\xc2\xa3")[a("\xa1\xc2")] } + assert_nil(e("\xa1\xa2\xa3\xa4")[e("\xa2\xa3")]) + + bug2379 = '[ruby-core:26787]' + assert_equal("\u{439}", "\u{439}"[0, 30], bug2379) + assert_equal("\u{439}", "a\u{439}"[1, 30], bug2379) + assert_equal("\u{439}", "a\u{439}bcdefghijklmnop"[1, 1][0, 1], bug2379) + end + + def test_str_aref_force_encoding + bug5836 = '[ruby-core:41896]' + Encoding.list.each do |enc| + next unless enc.ascii_compatible? + s = "abc".force_encoding(enc) + assert_equal("", s[3, 1], bug5836) + end + end + + def test_aset + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_raise(Encoding::CompatibilityError){s["\xb0\xa3"] = "foo"} + + a = ua("a") + a[/a/] = u("") + assert_equal Encoding::US_ASCII, a.encoding + end + + def test_str_center + assert_encoding("EUC-JP", "a".center(5, e("\xa1\xa2")).encoding) + assert_encoding("EUC-JP", e("\xa3\xb0").center(10).encoding) + end + + def test_squeeze + s = e("\xa3\xb0\xa3\xb1\xa3\xb1\xa3\xb3\xa3\xb4") + assert_equal(e("\xa3\xb0\xa3\xb1\xa3\xb3\xa3\xb4"), s.squeeze) + end + + def test_tr + s = s("\x81\x41") + assert_equal(s.tr("A", "B"), s) + assert_equal(s.tr_s("A", "B"), s) + + assert_nothing_raised { + "a".force_encoding("ASCII-8BIT").tr(a("a"), a("a")) + } + + assert_equal(e("\xA1\xA1"), a("a").tr(a("a"), e("\xA1\xA1"))) + + assert_equal("X\u3042\u3044X", "A\u3042\u3044\u3046".tr("^\u3042\u3044", "X")) + assert_equal("\u3042\u3046" * 100, ("\u3042\u3044" * 100).tr("\u3044", "\u3046")) + assert_equal("Y", "\u3042".tr("^X", "Y")) + end + + def test_tr_s + assert_equal("\xA1\xA1".force_encoding("EUC-JP"), + "a".force_encoding("ASCII-8BIT").tr("a".force_encoding("ASCII-8BIT"), "\xA1\xA1".force_encoding("EUC-JP"))) + end + + def test_count + assert_equal(0, e("\xa1\xa2").count("z")) + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_raise(Encoding::CompatibilityError){s.count(a("\xa3\xb0"))} + end + + def test_count_sjis_trailing_byte + bug10078 = '[ruby-dev:48442] [Bug #10078]' + assert_equal(0, s("\x98\x61").count("a"), bug10078) + end + + def test_delete + assert_equal(1, e("\xa1\xa2").delete("z").length) + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_raise(Encoding::CompatibilityError){s.delete(a("\xa3\xb2"))} + + a = "\u3042\u3044\u3046\u3042\u3044\u3046" + a.delete!("\u3042\u3044", "^\u3044") + assert_equal("\u3044\u3046\u3044\u3046", a) + end + + def test_include? + assert_equal(false, e("\xa1\xa2\xa3\xa4").include?(e("\xa3"))) + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_equal(false, s.include?(e("\xb0\xa3"))) + bug11488 = '[ruby-core:70592] [Bug #11488]' + each_encoding("abcdef", "def") do |str, substr| + assert_equal(true, str.include?(substr), bug11488) + end + end + + def test_index + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_nil(s.index(e("\xb3\xa3"))) + assert_nil(e("\xa1\xa2\xa3\xa4").index(e("\xa3"))) + assert_nil(e("\xa1\xa2\xa3\xa4").rindex(e("\xa3"))) + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + + a_with_e = /EUC-JP and BINARY \(ASCII-8BIT\)/ + assert_raise_with_message(Encoding::CompatibilityError, a_with_e) do + s.index(a("\xb1\xa3")) + end + assert_raise_with_message(Encoding::CompatibilityError, a_with_e) do + s.rindex(a("\xb1\xa3")) + end + + a_with_e = /BINARY \(ASCII-8BIT\) regexp with EUC-JP string/ + assert_raise_with_message(Encoding::CompatibilityError, a_with_e) do + s.index(Regexp.new(a("\xb1\xa3"))) + end + assert_raise_with_message(Encoding::CompatibilityError, a_with_e) do + s.rindex(Regexp.new(a("\xb1\xa3"))) + end + + bug11488 = '[ruby-core:70592] [Bug #11488]' + each_encoding("abcdef", "def") do |str, substr| + assert_equal(3, str.index(substr), bug11488) + end + end + + def test_next + s1 = e("\xa1\xa1") + s2 = s1.dup + (94*94+94).times { s2.next! } + assert_not_equal(s1, s2) + end + + def test_sub + s = "abc".sub(/b/, "\xa1\xa1".force_encoding("euc-jp")) + assert_encoding("EUC-JP", s.encoding) + assert_equal(Encoding::EUC_JP, "\xa4\xa2".force_encoding("euc-jp").sub(/./, '\&').encoding) + assert_equal(Encoding::EUC_JP, "\xa4\xa2".force_encoding("euc-jp").gsub(/./, '\&').encoding) + end + + def test_sub2 + s = "\x80".force_encoding("ASCII-8BIT") + r = Regexp.new("\x80".force_encoding("ASCII-8BIT")) + s2 = s.sub(r, "") + assert_empty(s2) + assert_predicate(s2, :ascii_only?) + end + + def test_sub3 + repl = "\x81".force_encoding("sjis") + assert_equal(false, repl.valid_encoding?) + s = "a@".sub(/a/, repl) + assert_predicate(s, :valid_encoding?) + end + + def test_insert + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_equal(e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4a"), s.insert(-1, "a")) + end + + def test_scan + assert_equal(["a"], e("\xa1\xa2a\xa3\xa4").scan(/a/)) + end + + def test_dup_scan + s1 = e("\xa4\xa2")*100 + s2 = s1.b + s2.scan(/\A./n) {|f| + assert_equal(Encoding::ASCII_8BIT, f.encoding) + } + end + + def test_dup_aref + s1 = e("\xa4\xa2")*100 + s2 = s1.b + assert_equal(Encoding::ASCII_8BIT, s2[10..-1].encoding) + end + + def test_upto + s1 = e("\xa1\xa2") + s2 = s("\xa1\xa2") + assert_raise(Encoding::CompatibilityError){s1.upto(s2) {|x| break }} + end + + def test_casecmp + s1 = s("\x81\x41") + s2 = s("\x81\x61") + assert_not_equal(0, s1.casecmp(s2)) + end + + def test_reverse + bug11387 = '[ruby-dev:49189] [Bug #11387]' + s1 = u("abcdefghij\xf0") + s2 = s1.reverse + assert_not_predicate(s1, :valid_encoding?, bug11387) + assert_equal(u("\xf0jihgfedcba"), s2) + assert_not_predicate(s2, :valid_encoding?, bug11387) + end + + def test_reverse_bang + s = u("abcdefghij\xf0") + s.reverse! + assert_equal(u("\xf0jihgfedcba"), s) + end + + def test_plus + assert_raise(Encoding::CompatibilityError){u("\xe3\x81\x82") + a("\xa1")} + end + + def test_chomp + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_raise(Encoding::CompatibilityError){s.chomp(s("\xa3\xb4"))} + end + + def test_gsub + s = 'abc' + s.ascii_only? + s.gsub!(/b/, "\x80") + assert_equal(false, s.ascii_only?, "[ruby-core:14566] reported by Sam Ruby") + + s = "abc".force_encoding(Encoding::ASCII_8BIT) + assert_equal(Encoding::ASCII_8BIT, s.encoding) + + assert_raise(Encoding::CompatibilityError) { + "abc".gsub(/[ac]/) { + $& == "a" ? "\xc2\xa1".force_encoding("euc-jp") : + "\xc2\xa1".force_encoding("utf-8") + } + } + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_equal(e("\xa3\xb0z\xa3\xb2\xa3\xb3\xa3\xb4"), s.gsub(/\xa3\xb1/e, "z")) + + assert_equal(Encoding::ASCII_8BIT, (a("").gsub(//) { e("") }.encoding)) + assert_equal(Encoding::ASCII_8BIT, (a("a").gsub(/a/) { e("") }.encoding)) + end + + def test_end_with + s1 = s("\x81\x40") + s2 = "@" + assert_equal(false, s1.end_with?(s2), "#{encdump s1}.end_with?(#{encdump s2})") + each_encoding("\u3042\u3044", "\u3044") do |_s1, _s2| + assert_equal(true, _s1.end_with?(_s2), "#{encdump _s1}.end_with?(#{encdump _s2})") + end + each_encoding("\u3042a\u3044", "a\u3044") do |_s1, _s2| + assert_equal(true, _s1.end_with?(_s2), "#{encdump _s1}.end_with?(#{encdump _s2})") + end + end + + def test_each_line + s = e("\xa3\xb0\xa3\xb1\xa3\xb2\xa3\xb3\xa3\xb4") + assert_raise(Encoding::CompatibilityError){s.each_line(a("\xa3\xb1")) {|l| }} + s = e("\xa4\xa2\nfoo") + + actual = [] + s.each_line {|line| actual << line } + expected = [e("\xa4\xa2\n"), e("foo")] + assert_equal(expected, actual) + end + + def test_each_char + a = [e("\xa4\xa2"), "b", e("\xa4\xa4"), "c"] + s = "\xa4\xa2b\xa4\xa4c".force_encoding("euc-jp") + assert_equal(a, s.each_char.to_a, "[ruby-dev:33211] #{encdump s}.each_char.to_a") + end + + def test_str_concat + assert_equal(1, "".concat(0xA2).size) + assert_equal(Encoding::ASCII_8BIT, "".force_encoding("US-ASCII").concat(0xA2).encoding) + assert_equal("A\x84\x31\xA4\x39".force_encoding("GB18030"), + "A".force_encoding("GB18030") << 0x8431A439) + end + + def test_regexp_match + assert_equal([0,0], //.match("\xa1\xa1".force_encoding("euc-jp"),-1).offset(0)) + assert_equal(0, // =~ :a) + end + + def test_split + assert_equal(e("\xa1\xa2\xa1\xa3").split(//), + [e("\xa1\xa2"), e("\xa1\xa3")], + '[ruby-dev:32452]') + + each_encoding("abc,def", ",", "abc", "def") do |str, sep, *expected| + assert_equal(expected, str.split(sep, -1)) + end + each_encoding("abc\0def", "\0", "abc", "def") do |str, sep, *expected| + assert_equal(expected, str.split(sep, -1)) + end + end + + def test_nonascii_method_name + eval(e("def \xc2\xa1() @nonascii_method_name = :e end")) + eval(u("def \xc2\xa1() @nonascii_method_name = :u end")) + eval(e("\xc2\xa1()")) + assert_equal(:e, @nonascii_method_name) + eval(u("\xc2\xa1()")) + assert_equal(:u, @nonascii_method_name) + me = method(e("\xc2\xa1")) + mu = method(u("\xc2\xa1")) + assert_not_equal(me.name, mu.name) + assert_not_equal(me.inspect, mu.inspect) + assert_equal(e("\xc2\xa1"), me.name.to_s) + assert_equal(u("\xc2\xa1"), mu.name.to_s) + end + + def test_symbol + s1 = "\xc2\xa1".force_encoding("euc-jp").intern + s2 = "\xc2\xa1".force_encoding("utf-8").intern + assert_not_equal(s1, s2) + end + + def test_symbol_op + ops = %w" + .. ... + - * / % ** +@ -@ | ^ & ! <=> > >= < <= == + === != =~ !~ ~ ! [] []= << >> :: ` + " + ops.each do |op| + assert_equal(Encoding::US_ASCII, op.intern.encoding, "[ruby-dev:33449]") + end + end + + def test_chr + 0.upto(255) {|b| + assert_equal([b].pack("C"), b.chr) + } + assert_equal("\x84\x31\xA4\x39".force_encoding("GB18030"), 0x8431A439.chr("GB18030")) + e = assert_raise(RangeError) { + 2206368128.chr(Encoding::UTF_8) + } + assert_not_match(/-\d+ out of char range/, e.message) + + assert_raise(RangeError){ 0x80.chr("US-ASCII") } + assert_raise(RangeError){ 0x80.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0xE0.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0x100.chr("SHIFT_JIS") } + assert_raise(RangeError){ 0xA0.chr("EUC-JP") } + assert_raise(RangeError){ 0x100.chr("EUC-JP") } + assert_raise(RangeError){ 0xA1A0.chr("EUC-JP") } + end + + def test_marshal + s1 = "\xa1\xa1".force_encoding("euc-jp") + s2 = Marshal.load(Marshal.dump(s1)) + assert_equal(s1, s2) + end + + def test_env + if RUBY_PLATFORM =~ /bccwin|mswin|mingw/ + env_encoding = Encoding::UTF_8 + else + env_encoding = Encoding.find("locale") + end + ENV.each {|k, v| + assert_equal(env_encoding, k.encoding, proc {"key(#{k.encoding})=#{k.dump}"}) + assert_equal(env_encoding, v.encoding, proc {"key(#{k.encoding})=#{k.dump}\n" "value(#{v.encoding})=#{v.dump}"}) + } + end + + def test_empty_string + assert_equal(Encoding::US_ASCII, "".encoding) + end + + def test_nil_to_s + assert_equal(Encoding::US_ASCII, nil.to_s.encoding) + end + + def test_nil_inspect + assert_equal(Encoding::US_ASCII, nil.inspect.encoding) + end + + def test_true_to_s + assert_equal(Encoding::US_ASCII, true.to_s.encoding) + end + + def test_false_to_s + assert_equal(Encoding::US_ASCII, false.to_s.encoding) + end + + def test_fixnum_to_s + assert_equal(Encoding::US_ASCII, 1.to_s.encoding) + end + + def test_float_to_s + assert_equal(Encoding::US_ASCII, 1.0.to_s.encoding) + end + + def test_bignum_to_s + assert_equal(Encoding::US_ASCII, (1 << 129).to_s.encoding) + end + + def test_array_to_s + assert_equal(Encoding::US_ASCII, [].to_s.encoding) + assert_equal(Encoding::US_ASCII, [nil].to_s.encoding) + assert_equal(Encoding::US_ASCII, [1].to_s.encoding) + assert_equal("".inspect.encoding, [""].to_s.encoding) + assert_equal("a".inspect.encoding, ["a"].to_s.encoding) + assert_equal(Encoding::US_ASCII, [nil,1,"","a","\x20",[]].to_s.encoding) + end + + def test_hash_to_s + assert_equal(Encoding::US_ASCII, {}.to_s.encoding) + assert_equal(Encoding::US_ASCII, {1=>nil,"foo"=>""}.to_s.encoding) + end + + def test_encoding_find + assert_raise(TypeError) {Encoding.find(nil)} + assert_raise(TypeError) {Encoding.find(0)} + assert_raise(TypeError) {Encoding.find([])} + assert_raise(TypeError) {Encoding.find({})} + end + + def test_encoding_to_s + assert_equal(Encoding::US_ASCII, Encoding::US_ASCII.to_s.encoding) + assert_equal(Encoding::US_ASCII, Encoding::US_ASCII.inspect.encoding) + end + + def test_regexp_source + s = "\xa4\xa2".force_encoding("euc-jp") + r = Regexp.new(s) + t = r.source + assert_equal(s, t, "[ruby-dev:33377] Regexp.new(#{encdump s}).source") + end + + def test_magic_comment + assert_equal(Encoding::US_ASCII, eval("__ENCODING__".force_encoding("US-ASCII"))) + assert_equal(Encoding::ASCII_8BIT, eval("__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::US_ASCII, eval("# -*- encoding: US-ASCII -*-\n__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::ASCII_8BIT, eval("# -*- encoding: ASCII-8BIT -*-\n__ENCODING__".force_encoding("US-ASCII"))) + end + + def test_magic_comment_vim + assert_equal(Encoding::US_ASCII, eval("# vim: filetype=ruby, fileencoding: US-ASCII, ts=3, sw=3\n__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::ASCII_8BIT, eval("# vim: filetype=ruby, fileencoding: ASCII-8BIT, ts=3, sw=3\n__ENCODING__".force_encoding("US-ASCII"))) + end + + def test_magic_comment_at_various_positions + # after shebang + assert_equal(Encoding::US_ASCII, eval("#!/usr/bin/ruby\n# -*- encoding: US-ASCII -*-\n__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::ASCII_8BIT, eval("#!/usr/bin/ruby\n# -*- encoding: ASCII-8BIT -*-\n__ENCODING__".force_encoding("US-ASCII"))) + # wrong position + assert_equal(Encoding::ASCII_8BIT, eval("\n# -*- encoding: US-ASCII -*-\n__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::US_ASCII, eval("\n# -*- encoding: ASCII-8BIT -*-\n__ENCODING__".force_encoding("US-ASCII"))) + + # leading expressions + assert_equal(Encoding::ASCII_8BIT, eval("v=1 # -*- encoding: US-ASCII -*-\n__ENCODING__".force_encoding("ASCII-8BIT"))) + assert_equal(Encoding::US_ASCII, eval("v=1 # -*- encoding: ASCII-8BIT -*-\n__ENCODING__".force_encoding("US-ASCII"))) + end + + def test_regexp_usascii + tests = [ + [__LINE__, '//', Encoding::US_ASCII], + [__LINE__, '/#{ }/', Encoding::US_ASCII], + [__LINE__, '/#{"a"}/', Encoding::US_ASCII], + [__LINE__, '/#{%q"\x80"}/', Encoding::US_ASCII], + [__LINE__, '/#{"\x80"}/', Encoding::ASCII_8BIT], + + [__LINE__, '/a/', Encoding::US_ASCII], + [__LINE__, '/a#{ }/', Encoding::US_ASCII], + [__LINE__, '/a#{"a"}/', Encoding::US_ASCII], + [__LINE__, '/a#{%q"\x80"}/', Encoding::ASCII_8BIT], + [__LINE__, '/a#{"\x80"}/', Encoding::ASCII_8BIT], + + [__LINE__, '/\x80/', Encoding::ASCII_8BIT], + [__LINE__, '/\x80#{ }/', Encoding::ASCII_8BIT], + [__LINE__, '/\x80#{"a"}/', Encoding::ASCII_8BIT], + [__LINE__, '/\x80#{%q"\x80"}/', Encoding::ASCII_8BIT], + [__LINE__, '/\x80#{"\x80"}/', Encoding::ASCII_8BIT], + + [__LINE__, '/\u1234/', Encoding::UTF_8], + [__LINE__, '/\u1234#{ }/', Encoding::UTF_8], + [__LINE__, '/\u1234#{"a"}/', Encoding::UTF_8], + + [__LINE__, '/\u1234#{%q"\x80"}/', nil, SyntaxError], + [__LINE__, '/\u1234#{"\x80"}/', nil, SyntaxError], + [__LINE__, '/\u1234\x80/', nil, SyntaxError], + [__LINE__, '/\u1234#{ }\x80/', nil, RegexpError], + ] + all_assertions_foreach(nil, *tests) do |line, r, enc, ex| + code = "# -*- encoding: US-ASCII -*-\n#{r}.encoding" + if ex + assert_raise(ex) {eval(code, nil, __FILE__, line-1)} + else + assert_equal(enc, eval(code, nil, __FILE__, line-1)) + end + end + end + + def test_gbk + assert_equal("", "\x81\x40".force_encoding("GBK").chop) + end + + def test_euc_tw + assert_equal("a", "a\x8e\xa2\xa1\xa1".force_encoding("euc-tw").chop) + end + + def test_valid_encoding + s = "\xa1".force_encoding("euc-jp") + assert_equal(false, s.valid_encoding?) + assert_equal(true, (s+s).valid_encoding?, "[ruby-dev:33826]") + assert_equal(true, (s*2).valid_encoding?, "[ruby-dev:33826]") + assert_equal(true, ("%s%s" % [s, s]).valid_encoding?) + assert_equal(true, (s.dup << s).valid_encoding?) + assert_equal(true, "".center(2, s).valid_encoding?) + + s = "\xa1\xa1\x8f".force_encoding("euc-jp") + assert_equal(false, s.valid_encoding?) + assert_equal(true, s.reverse.valid_encoding?) + + bug4018 = '[ruby-core:33027]' + s = "\xa1\xa1".force_encoding("euc-jp") + assert_equal(true, s.valid_encoding?) + s << "\x8f".force_encoding("euc-jp") + assert_equal(false, s.valid_encoding?, bug4018) + s = "aa".force_encoding("utf-16be") + assert_equal(true, s.valid_encoding?) + s << "\xff".force_encoding("utf-16be") + assert_equal(false, s.valid_encoding?, bug4018) + + bug6190 = '[ruby-core:43557]' + s = "\xe9" + s = s.encode("utf-8", "utf-8") + assert_equal(false, s.valid_encoding?, bug6190) + s = "\xe9" + s.encode!("utf-8", "utf-8") + assert_equal(false, s.valid_encoding?, bug6190) + end + + def test_getbyte + assert_equal(0x82, u("\xE3\x81\x82\xE3\x81\x84").getbyte(2)) + assert_equal(0x82, u("\xE3\x81\x82\xE3\x81\x84").getbyte(-4)) + assert_nil(u("\xE3\x81\x82\xE3\x81\x84").getbyte(100)) + end + + def test_setbyte + s = u("\xE3\x81\x82\xE3\x81\x84") + s.setbyte(2, 0x84) + assert_equal(u("\xE3\x81\x84\xE3\x81\x84"), s) + + s = u("\xE3\x81\x82\xE3\x81\x84") + assert_raise(IndexError) { s.setbyte(100, 0) } + + s = u("\xE3\x81\x82\xE3\x81\x84") + s.setbyte(-4, 0x84) + assert_equal(u("\xE3\x81\x84\xE3\x81\x84"), s) + + x = "x" * 100 + t = nil + failure = proc {"#{i}: #{encdump(t)}"} + + s = "\u{3042 3044}" + s.bytesize.times {|i| + t = s + x + t.setbyte(i, t.getbyte(i)+1) + assert_predicate(t, :valid_encoding?, failure) + assert_not_predicate(t, :ascii_only?, failure) + t = s + x + t.setbyte(i, 0x20) + assert_not_predicate(t, :valid_encoding?, failure) + } + + s = "\u{41 42 43}" + s.bytesize.times {|i| + t = s + x + t.setbyte(i, 0x20) + assert_predicate(t, :valid_encoding?, failure) + assert_predicate(t, :ascii_only?, failure) + t.setbyte(i, 0xe3) + assert_not_predicate(t, :valid_encoding?, failure) + } + end + + def test_setbyte_range + s = u("\xE3\x81\x82\xE3\x81\x84") + assert_nothing_raised { s.setbyte(0, -1) } + assert_nothing_raised { s.setbyte(0, 0x00) } + assert_nothing_raised { s.setbyte(0, 0x7F) } + assert_nothing_raised { s.setbyte(0, 0x80) } + assert_nothing_raised { s.setbyte(0, 0xff) } + assert_nothing_raised { s.setbyte(0, 0x100) } + assert_nothing_raised { s.setbyte(0, 0x4f7574206f6620636861722072616e6765) } + end + + def test_compatible + assert_nil Encoding.compatible?("",0) + assert_equal(Encoding::UTF_8, Encoding.compatible?(u(""), ua("abc"))) + assert_equal(Encoding::UTF_8, Encoding.compatible?(Encoding::UTF_8, Encoding::UTF_8)) + assert_equal(Encoding::UTF_8, Encoding.compatible?(Encoding::UTF_8, Encoding::US_ASCII)) + assert_equal(Encoding::ASCII_8BIT, + Encoding.compatible?(Encoding::ASCII_8BIT, Encoding::US_ASCII)) + assert_nil Encoding.compatible?(Encoding::UTF_8, Encoding::ASCII_8BIT) + end + + def test_force_encoding + assert_equal(u("\x80"), "".center(1, u("\x80")), + "moved from btest/knownbug, [ruby-dev:33807]") + a = "".force_encoding("ascii-8bit") << 0xC3 << 0xB6 + assert_equal(1, a.force_encoding("utf-8").size, '[ruby-core:22437]') + b = "".force_encoding("ascii-8bit") << 0xC3.chr << 0xB6.chr + assert_equal(1, b.force_encoding("utf-8").size, '[ruby-core:22437]') + + assert_raise(TypeError){ ''.force_encoding(nil) } + end + + def test_combchar_codepoint + assert_equal([0x30BB, 0x309A], "\u30BB\u309A".codepoints.to_a) + assert_equal([0x30BB, 0x309A], "\u30BB\u309A".codepoints.to_a) + end + + def each_encoding(*strings) + Encoding.list.each do |enc| + next if enc.dummy? + strs = strings.map {|s| s.encode(enc)} rescue next + yield(*strs) + end + end + + def test_str_b + s = "\u3042" + assert_equal(a("\xE3\x81\x82"), s.b) + assert_equal(Encoding::ASCII_8BIT, s.b.encoding) + s = "abc".b + assert_predicate(s.b, :ascii_only?) + end + + def test_scrub_valid_string + str = "foo" + assert_equal(str, str.scrub) + assert_not_same(str, str.scrub) + str = "\u3042\u3044" + assert_equal(str, str.scrub) + assert_not_same(str, str.scrub) + str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding + assert_equal(str, str.scrub) + assert_not_same(str, str.scrub) + assert_nothing_raised(ArgumentError) {str.scrub(nil)} + end + + def test_scrub_modification_inside_block + str = ("abc\u3042".b << "\xE3\x80".b).force_encoding('UTF-8') + assert_raise(RuntimeError) {str.scrub{|_| str << "1234567890"; "?" }} + + str = "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE) + assert_raise(RuntimeError) do + str.scrub do |_| + str << "1\x002\x00".force_encoding('UTF-16LE') + "?\x00".force_encoding('UTF-16LE') + end + end + end + + def test_scrub_replace_default + assert_equal("\uFFFD\uFFFD\uFFFD", u("\x80\x80\x80").scrub) + assert_equal("\uFFFDA", u("\xF4\x80\x80A").scrub) + + # examples in Unicode 6.1.0 D93b + assert_equal("\x41\uFFFD\uFFFD\x41\uFFFD\x41", + u("\x41\xC0\xAF\x41\xF4\x80\x80\x41").scrub) + assert_equal("\x41\uFFFD\uFFFD\uFFFD\x41", + u("\x41\xE0\x9F\x80\x41").scrub) + assert_equal("\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + assert_equal("abcdefghijklmnopqrstuvwxyz\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064", + u("abcdefghijklmnopqrstuvwxyz\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64").scrub) + end + + def test_scrub_replace_argument + assert_equal("foo", u("foo").scrub("\u3013")) + assert_equal("\u3042\u3044", u("\xE3\x81\x82\xE3\x81\x84").scrub("\u3013")) + assert_equal("\u3042\u3013", u("\xE3\x81\x82\xE3\x81").scrub("\u3013")) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub(e("\xA4\xA2")) } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub(1) } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub(u("\x81")) } + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub(e("\xA2\xAE"))) + end + + def test_scrub_replace_block + assert_equal("\u3042<e381>", u("\xE3\x81\x82\xE3\x81").scrub{|x|'<'+x.unpack('H*')[0]+'>'}) + assert_raise(Encoding::CompatibilityError){ u("\xE3\x81\x82\xE3\x81").scrub{e("\xA4\xA2")} } + assert_raise(TypeError){ u("\xE3\x81\x82\xE3\x81").scrub{1} } + assert_raise(ArgumentError){ u("\xE3\x81\x82\xE3\x81\x82\xE3\x81").scrub{u("\x81")} } + assert_equal(e("\xA4\xA2\xA2\xAE"), e("\xA4\xA2\xA4").scrub{e("\xA2\xAE")}) + + assert_equal(u("\x81"), u("a\x81c").scrub {|c| break c}) + assert_raise(ArgumentError) {u("a\x81").scrub {|c| c}} + assert_raise(ArgumentError) {u("a").scrub("?") {|c| c}} + end + + def test_scrub_widechar + assert_equal("\uFFFD\u3042".encode("UTF-16BE"), + "\xD8\x00\x30\x42".force_encoding(Encoding::UTF_16BE). + scrub) + assert_equal("\uFFFD\u3042".encode("UTF-16LE"), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + scrub) + assert_equal("\uFFFD".encode("UTF-32BE"), + "\xff".force_encoding(Encoding::UTF_32BE). + scrub) + assert_equal("\uFFFD".encode("UTF-32LE"), + "\xff".force_encoding(Encoding::UTF_32LE). + scrub) + c = nil + assert_equal("?\u3042".encode(Encoding::UTF_16LE), + "\x00\xD8\x42\x30".force_encoding(Encoding::UTF_16LE). + scrub {|e| c = e; "?".encode(Encoding::UTF_16LE)}) + assert_equal("\x00\xD8".force_encoding(Encoding::UTF_16LE), c) + assert_raise(ArgumentError) {"\uFFFD\u3042".encode("UTF-16BE").scrub("") {}} + end + + def test_scrub_dummy_encoding + s = "\u{3042}".encode("iso-2022-jp") + assert_equal(s, s.scrub) + assert_equal(s, s.force_encoding("iso-2022-jp").scrub("?")) + end + + def test_scrub_bang + str = "\u3042\u3044" + assert_same(str, str.scrub!) + str.force_encoding(Encoding::ISO_2022_JP) # dummy encoding + assert_same(str, str.scrub!) + assert_nothing_raised(ArgumentError) {str.scrub!(nil)} + + str = u("\x80\x80\x80") + str.scrub! + assert_same(str, str.scrub!) + assert_equal("\uFFFD\uFFFD\uFFFD", str) + end + + def test_escaped_metachar + bug10670 = '[ruby-core:67193] [Bug #10670]' + + escape_plain = /\A[\x5B]*\z/.freeze + + assert_match(escape_plain, 0x5b.chr(::Encoding::UTF_8), bug10670) + assert_match(escape_plain, 0x5b.chr, bug10670) + end + + def test_inspect_with_default_internal + bug11787 = '[ruby-dev:49415] [Bug #11787]' + + s = EnvUtil.with_default_internal(::Encoding::EUC_JP) do + [e("\xB4\xC1\xBB\xFA")].inspect + end + assert_equal(e("[\"\xB4\xC1\xBB\xFA\"]"), s, bug11787) + end + + def test_encoding_names_of_default_internal + # [Bug #20595] [Bug #20598] + [ + "default_internal.names", + "name_list", + "aliases.keys" + ].each do |method| + assert_separately(%w(-W0), <<~RUBY) + exp_name = "int" + "ernal" + Encoding.default_internal = Encoding::ASCII_8BIT + name = Encoding.#{method}.find { |x| x == exp_name } + Encoding.default_internal = nil + assert_equal exp_name, name, "Encoding.#{method} [Bug #20595] [Bug #20598]" + RUBY + end + end + + def test_greek_capital_gap + bug12204 = '[ruby-core:74478] [Bug #12204] GREEK CAPITAL RHO and SIGMA' + assert_equal("\u03A3", "\u03A1".succ, bug12204) + end +end |
