summaryrefslogtreecommitdiff
path: root/test/ruby/test_transcode.rb
diff options
context:
space:
mode:
authorMartin Dürst <duerst@it.aoyama.ac.jp>2019-07-14 10:58:50 +0900
committerMartin Dürst <duerst@it.aoyama.ac.jp>2019-07-14 10:58:50 +0900
commit369ff79394765ce198ac7cee872a8c739d895aaa (patch)
treed373a8e2a3b835f981a85dbb0e25e93730fba776 /test/ruby/test_transcode.rb
parentac2866005b96baf986072f86ecd3dfd887f2bda3 (diff)
add encoding conversion from/to CESU-8
Add encoding conversion (transcoding) from UTF-8 to CESU-8 and back. CESU-8 is an encoding similar to UTF-8, but it encodes code points above U+FFFF as two surrogates, each of which is in turn encoded as if it were a UTF-8 code point. This preserves the same binary sorting order as in UTF-16. It is also somewhat similar (although not exactly identical) to an encoding used internally by Java. This completes issue #15995. enc/trans/cesu_8.trans: Add encoding conversion from/to CESU-8; test/ruby/test_transcode.rb: Add tests for above
Diffstat (limited to 'test/ruby/test_transcode.rb')
-rw-r--r--test/ruby/test_transcode.rb22
1 files changed, 22 insertions, 0 deletions
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb
index 44d238ffd2..f405877dd5 100644
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@@ -2116,6 +2116,28 @@ class TestTranscode < Test::Unit::TestCase
check_both_ways("D\u00FCrst", "\xC4\xDC\x99\xA2\xA3", 'IBM037') # Dürst
end
+ def test_CESU_8 # UTF-8 <-> CESU-8 transcoding, from single bytes up to surrogate pairs
+ check_both_ways("aijrszAIJRSZ09", "aijrszAIJRSZ09", 'CESU-8') # single bytes: same on both sides
+
+ # check NUL (U+0000) explicitly: the CESU-8 side is expected to be the same single "\0" byte;
+ # this is where CESU-8 and Java modified UTF-8 strings differ
+ check_both_ways("\0", "\0", 'CESU-8')
+
+ # two-byte characters (same bytes expected on both sides): U+0080 U+00FC U+00FF U+0100 U+0400 U+0700 U+07FF
+ two_byte_chars = "\xC2\x80\x20\xC3\xBC\x20\xC3\xBF\x20\xC4\x80\x20\xD0\x80\x20\xDC\x80\x20\xDF\xBF"
+ check_both_ways(two_byte_chars, two_byte_chars, 'CESU-8')
+
+ # three-byte characters (same bytes expected on both sides): U+0800 U+2200 U+4E00 U+D7FF U+E000 U+FFFF
+ three_byte_chars = "\xE0\xA0\x80\x20\xE2\x88\x80\x20\xE4\xB8\x80\x20\xED\x9F\xBF\x20\xEE\x80\x80\x20\xEF\xBF\xBF"
+ check_both_ways(three_byte_chars, three_byte_chars, 'CESU-8')
+
+ # characters outside BMP: four bytes in UTF-8, two three-byte surrogates (six bytes) in CESU-8
+ # U+10000 U+20000 U+50000 U+10FFFF, i.e. the lowest and highest such code points plus two in between
+ utf8 = "\xF0\x90\x80\x80 \xF0\xA0\x80\x80 \xF1\x90\x80\x80 \xF4\x8F\xBF\xBF"
+ cesu = "\xED\xA0\x80\xED\xB0\x80 \xED\xA1\x80\xED\xB0\x80 \xED\xA4\x80\xED\xB0\x80 \xED\xAF\xBF\xED\xBF\xBF"
+ check_both_ways(utf8, cesu, 'CESU-8')
+ end
+
def test_nothing_changed
a = "James".force_encoding("US-ASCII")
b = a.encode("Shift_JIS")