diff options
author | Martin Dürst <duerst@it.aoyama.ac.jp> | 2019-07-14 10:58:50 +0900 |
---|---|---|
committer | Martin Dürst <duerst@it.aoyama.ac.jp> | 2019-07-14 10:58:50 +0900 |
commit | 369ff79394765ce198ac7cee872a8c739d895aaa (patch) | |
tree | d373a8e2a3b835f981a85dbb0e25e93730fba776 /enc | |
parent | ac2866005b96baf986072f86ecd3dfd887f2bda3 (diff) |
add encoding conversion from/to CESU-8
Add encoding conversion (transcoding) from UTF-8 to CESU-8
and back. CESU-8 is an encoding similar to UTF-8, but encodes
codepoints above U+FFFF as two surrogates, these surrogates
again being encoded as if they were UTF-8 codepoints. This
preserves the same binary sorting order as in UTF-16. It is
also somewhat similar (although not exactly identical) to an
encoding used internally by Java.
This completes issue #15995.
enc/trans/cesu_8.trans: Add encoding conversion from/to CESU-8
test/ruby/test_transcode.rb: Add tests for above
Diffstat (limited to 'enc')
-rw-r--r-- | enc/trans/cesu_8.trans | 85 |
1 files changed, 85 insertions, 0 deletions
diff --git a/enc/trans/cesu_8.trans b/enc/trans/cesu_8.trans
new file mode 100644
index 0000000000..4e17b1ddbb
--- /dev/null
+++ b/enc/trans/cesu_8.trans
@@ -0,0 +1,85 @@
+#include "transcode_data.h"
+
+<%
+  map = {}
+  map["{00-7f}"] = :nomap
+  map["{c2-df}{80-bf}"] = :nomap
+  map["e0{a0-bf}{80-bf}"] = :nomap
+  map["{e1-ec}{80-bf}{80-bf}"] = :nomap
+  map["ed{80-9f}{80-bf}"] = :nomap
+  map["{ee-ef}{80-bf}{80-bf}"] = :nomap
+  map["ed{a0-af}{80-bf}ed{b0-bf}{80-bf}"] = :func_so # surrogate pairs
+  transcode_generate_node(ActionMap.parse(map), "from_CESU_8")
+
+  map = {}
+  map["{00-7f}"] = :nomap
+  map["{c2-df}{80-bf}"] = :nomap
+  map["e0{a0-bf}{80-bf}"] = :nomap
+  map["{e1-ec}{80-bf}{80-bf}"] = :nomap
+  map["ed{80-9f}{80-bf}"] = :nomap
+  map["{ee-ef}{80-bf}{80-bf}"] = :nomap
+  map["f0{90-bf}{80-bf}{80-bf}"] = :func_so # planes 1-3
+  map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so # planes 4-15
+  map["f4{80-8f}{80-bf}{80-bf}"] = :func_so # plane 16
+  transcode_generate_node(ActionMap.parse(map), "to_CESU_8")
+%>
+
+<%= transcode_generated_code %>
+/*
+ * Rewrite one 6-byte CESU-8 surrogate pair in s as a 4-byte UTF-8
+ * sequence in o.
+ *
+ * Presumably called for input matched by the :func_so entry of the
+ * from_CESU_8 map (ed{a0-af}{80-bf}ed{b0-bf}{80-bf}); the dispatch lives
+ * in generated code that is not shown here.
+ *
+ * Only s[1], s[2], s[4] and s[5] are read; s[0] and s[3] are the fixed
+ * 0xED bytes of that pattern. statep, l and osize are not used.
+ *
+ * Returns 4, the number of bytes stored in o.
+ */
+static ssize_t
+fun_so_from_cesu_8(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
+{
+    unsigned int scalar = ( ((s[1]&0x0F)<<16) | ((s[2]&0x3F)<<10)   /* first 3-byte unit:  bits 19..10 */
+                          | ((s[4]&0x0F)<< 6) | (s[5]&0x3F)         /* second 3-byte unit: bits  9..0  */
+                          ) + 0x10000;                              /* 20-bit offset -> 0x10000..0x10FFFF */
+    o[0] = 0xF0 | (scalar>>18);           /* lead byte: bits 20..18 */
+    o[1] = 0x80 | ((scalar>>12)&0x3F);    /* bits 17..12 */
+    o[2] = 0x80 | ((scalar>> 6)&0x3F);    /* bits 11..6 */
+    o[3] = 0x80 | ( scalar &0x3F);        /* bits 5..0 */
+    return 4;
+}
+/*
+ * Rewrite one 4-byte UTF-8 sequence in s as a 6-byte CESU-8 surrogate
+ * pair in o (two 3-byte units, each starting with 0xED).
+ *
+ * Presumably called for input matched by the :func_so entries of the
+ * to_CESU_8 map (lead bytes f0..f4); the dispatch lives in generated
+ * code that is not shown here.
+ *
+ * statep, l and osize are not used.
+ *
+ * Returns 6, the number of bytes stored in o.
+ */
+static ssize_t
+fun_so_to_cesu_8(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
+{
+    unsigned int scalar = ((s[0]&0x07)<<18) | ((s[1]&0x3F)<<12)
+                        | ((s[2]&0x3F)<< 6) | (s[3]&0x3F);
+    scalar -= 0x10000;                    /* keep only the 20-bit offset above U+FFFF */
+    o[0] = 0xED;
+    o[1] = 0xA0 | (scalar>>16);           /* bits 19..16; unmasked, so it relies on the to_CESU_8 patterns capping input at f4 8f bf bf */
+    o[2] = 0x80 | ((scalar>>10)&0x3F);    /* bits 15..10 */
+    o[3] = 0xED;
+    o[4] = 0xB0 | ((scalar>> 6)&0x0F);    /* bits 9..6 */
+    o[5] = 0x80 | (scalar &0x3F);         /* bits 5..0 */
+    return 6;
+}
+/*
+ * Transcoder descriptor for the CESU-8 -> UTF-8 direction; registered
+ * in TRANS_INIT(cesu_8) at the end of this file.
+ */
+static const rb_transcoder
+rb_from_CESU_8 = {
+    "CESU-8", "UTF-8", from_CESU_8,
+    TRANSCODE_TABLE_INFO,
+    1, /* input_unit_length */
+    6, /* max_input */
+    4, /* max_output */
+    asciicompat_decoder, /* asciicompat_type */
+    0, NULL, NULL, /* state_size, state_init, state_fini */
+    NULL, NULL, NULL, fun_so_from_cesu_8 /* three empty callback slots, then the converter above; slot names presumably come from transcode_data.h (not shown) */
+};
+/*
+ * Transcoder descriptor for the UTF-8 -> CESU-8 direction; registered
+ * in TRANS_INIT(cesu_8) at the end of this file.
+ */
+static const rb_transcoder
+rb_to_CESU_8 = {
+    "UTF-8", "CESU-8", to_CESU_8,
+    TRANSCODE_TABLE_INFO,
+    1, /* input_unit_length */
+    4, /* max_input */
+    6, /* max_output */
+    asciicompat_encoder, /* asciicompat_type */
+    0, NULL, NULL, /* state_size, state_init, state_fini */
+    NULL, NULL, NULL, fun_so_to_cesu_8 /* three empty callback slots, then the converter above; slot names presumably come from transcode_data.h (not shown) */
+};
+/* Registers both conversion directions via rb_register_transcoder. */
+TRANS_INIT(cesu_8)
+{
+    rb_register_transcoder(&rb_from_CESU_8);
+    rb_register_transcoder(&rb_to_CESU_8);
+}
|