#include "transcode_data.h"

<%
  require 'sjis-tbl'
  require 'eucjp-tbl'
%>

<%= transcode_tblgen "Shift_JIS", "UTF-8", [["{00-7f}", :nomap], *SJIS_TO_UCS_TBL] %>
<%= transcode_tblgen "Windows-31J", "UTF-8", [["{00-7f}", :nomap], *SJIS_TO_UCS_TBL] %>

<%= transcode_tblgen "UTF-8", "Shift_JIS", [["{00-7f}", :nomap], *UCS_TO_SJIS_TBL] %>
<%= transcode_tblgen "UTF-8", "Windows-31J", [["{00-7f}", :nomap], *UCS_TO_SJIS_TBL] %>

<%= transcode_tblgen "EUC-JP", "UTF-8", [["{00-7f}", :nomap], *EUCJP_TO_UCS_TBL] %>
<%= transcode_tblgen "CP51932", "UTF-8", [["{00-7f}", :nomap], *EUCJP_TO_UCS_TBL] %>

<%= transcode_tblgen "UTF-8", "EUC-JP", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>
<%= transcode_tblgen "UTF-8", "CP51932", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>

/*
 * Pack an escape-sequence prefix and one further byte into a single int:
 * the prefix occupies the high bits, the byte the low 8 bits.  The macro is
 * applied twice to build a full mode value (prefix -> prefix+final byte).
 * Both parameters are parenthesized so that any expression may be passed.
 */
#define ISO_2022_ENCODING(escseq, byte) (((escseq)<<8)|(byte))

/*
 * Bytes that follow ESC in the escape sequences this file knows about.
 * Single-byte entries are the lone intermediate byte; the "$" entries pack
 * the two intermediate bytes of a multi-byte designation with
 * ISO_2022_ENCODING.
 */
enum ISO_2022_ESCSEQ {
    ISO_2022_CZD   = '!',
    ISO_2022_C1D   = '"',
    ISO_2022_GZD4  = '(',
    ISO_2022_G1D4  = ')',
    ISO_2022_G2D4  = '*',
    ISO_2022_G3D4  = '+',
    ISO_2022_G1D6  = '-',
    ISO_2022_G2D6  = '.',
    ISO_2022_G3D6  = '/',
    ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('),
    ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'),
    ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'),
    ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'),
    ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'),
    ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'),
    ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'),
    ISO_2022_DOCS  = ISO_2022_ENCODING('%','I'),
    ISO_2022_IRR   = '&'
};

/*
 * Mode values: a designation prefix from ISO_2022_ESCSEQ combined with the
 * final byte of the escape sequence.  Single-byte sets use ISO_2022_GZD4
 * ("ESC ( F"); multi-byte sets use ISO_2022_GZDM4 ("ESC $ ( F", or the
 * short form "ESC $ F" for the finals '@', 'A' and 'B').
 */
#define ISO_2022_GZ_ASCII                 ISO_2022_ENCODING(ISO_2022_GZD4, 'B')
#define ISO_2022_GZ_JIS_X_0201_Katakana   ISO_2022_ENCODING(ISO_2022_GZD4, 'I')
#define ISO_2022_GZ_JIS_X_0201_Roman      ISO_2022_ENCODING(ISO_2022_GZD4, 'J')
#define ISO_2022_GZ_JIS_C_6226_1978       ISO_2022_ENCODING(ISO_2022_GZDM4,'@')
#define ISO_2022_GZ_JIS_X_0208_1983       ISO_2022_ENCODING(ISO_2022_GZDM4,'B')
#define ISO_2022_GZ_JIS_X_0212_1990       ISO_2022_ENCODING(ISO_2022_GZDM4,'D')
#define ISO_2022_GZ_JIS_X_0213_2000_1     ISO_2022_ENCODING(ISO_2022_GZDM4,'O')
#define ISO_2022_GZ_JIS_X_0213_2000_2     ISO_2022_ENCODING(ISO_2022_GZDM4,'P')
#define ISO_2022_GZ_JIS_X_0213_2004_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'Q') #define UNSUPPORTED_MODE TRANSCODE_ERROR static int get_iso_2022_mode(const unsigned char **in_pos) { int new_mode; const unsigned char *in_p = *in_pos; switch (*in_p++) { case '(': switch (*in_p++) { case 'B': case 'I': case 'J': new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1)); break; default: rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( %c)", *(in_p-1)); break; } break; case '$': switch (*in_p++) { case '@': case 'A': case 'B': new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); break; case '(': switch (*in_p++) { case 'D': case 'O': case 'P': case 'Q': new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1)); break; default: rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ ( %c)", *(in_p-1)); break; } break; default: rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ %c)", *(in_p-1)); break; } break; default: rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC %c)", *(in_p-1)); break; } *in_pos = in_p; return new_mode; } static void from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *my_transcoding) { const rb_transcoder *my_transcoder = my_transcoding->transcoder; const unsigned char *in_p = *in_pos; unsigned char *out_p = *out_pos; int cur_mode = ISO_2022_GZ_ASCII; unsigned char c1; unsigned char *out_s = out_stop - my_transcoder->max_output + 1; while (in_p < in_stop) { if (out_p >= out_s) { int len = (out_p - *out_pos); int new_len = (len + my_transcoder->max_output) * 2; *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); out_p = *out_pos + len; out_s = *out_pos + new_len - my_transcoder->max_output; } c1 = *in_p++; if (c1 == 0x1B) { cur_mode = get_iso_2022_mode(&in_p); } else if (c1 == 0x1E || c1 == 0x1F) { /* SHIFT */ rb_raise(UNSUPPORTED_MODE, "shift is not supported"); } else if (c1 >= 
0x80) { rb_raise(TRANSCODE_ERROR, "invalid byte sequence"); } else { switch (cur_mode) { case ISO_2022_GZ_ASCII: case ISO_2022_GZ_JIS_X_0201_Roman: *out_p++ = c1; break; case ISO_2022_GZ_JIS_X_0201_Katakana: *out_p++ = 0x8E; *out_p++ = c1 | 0x80; break; case ISO_2022_GZ_JIS_X_0212_1990: *out_p++ = 0x8F; case ISO_2022_GZ_JIS_C_6226_1978: case ISO_2022_GZ_JIS_X_0208_1983: *out_p++ = c1 | 0x80; *out_p++ = *in_p++ | 0x80; break; } } } /* cleanup */ *in_pos = in_p; *out_pos = out_p; } static int select_iso_2022_mode(unsigned char **out_pos, int new_mode) { unsigned char *out_p = *out_pos; *out_p++ = '\x1b'; switch (new_mode>>8) { case ISO_2022_GZD4: *out_p++ = new_mode >> 8; *out_p++ = new_mode & 0x7F; break; case ISO_2022_GZDM4: *out_p++ = new_mode >> 16; if ((new_mode & 0x7F) != '@' && (new_mode & 0x7F) != 'A' && (new_mode & 0x7F) != 'B') { *out_p++ = (new_mode>>8) & 0x7F; } *out_p++ = new_mode & 0x7F; break; default: rb_raise(UNSUPPORTED_MODE, "this mode is not supported."); break; } *out_pos = out_p; return new_mode; } static void to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos, unsigned char **out_pos, const unsigned char *in_stop, unsigned char *out_stop, rb_transcoding *my_transcoding) { const rb_transcoder *my_transcoder = my_transcoding->transcoder; const unsigned char *in_p = *in_pos; unsigned char *out_p = *out_pos; int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0; unsigned char next_byte; unsigned char *out_s = out_stop - my_transcoder->max_output + 1; while (in_p < in_stop) { if (out_p >= out_s) { int len = (out_p - *out_pos); int new_len = (len + my_transcoder->max_output) * 2; *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); out_p = *out_pos + len; out_s = *out_pos + new_len - my_transcoder->max_output; } next_byte = *in_p++; if (next_byte < 0x80) { new_mode = ISO_2022_GZ_ASCII; } else if (next_byte == 0x8E) { new_mode = ISO_2022_GZ_JIS_X_0201_Katakana; next_byte = *in_p++; } else if (next_byte == 0x8F) { new_mode 
= ISO_2022_GZ_JIS_X_0212_1990; next_byte = *in_p++; } else { new_mode = ISO_2022_GZ_JIS_X_0208_1983; } if (cur_mode != new_mode) cur_mode = select_iso_2022_mode(&out_p, new_mode); if (cur_mode < 0xFFFF) { *out_p++ = next_byte & 0x7F; } else { *out_p++ = next_byte & 0x7F; *out_p++ = *in_p++ & 0x7F; } } if (cur_mode != ISO_2022_GZ_ASCII) cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII); /* cleanup */ *in_pos = in_p; *out_pos = out_p; } static const rb_transcoder rb_from_ISO_2022_JP = { "ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0, &from_iso_2022_jp_transcoder_preprocessor, NULL, }; static const rb_transcoder rb_to_ISO_2022_JP = { "UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1, NULL, &to_iso_2022_jp_transcoder_postprocessor, }; void Init_japanese(void) { <%= transcode_register_code %> rb_register_transcoder(&rb_from_ISO_2022_JP); rb_register_transcoder(&rb_to_ISO_2022_JP); }