summaryrefslogtreecommitdiff
path: root/enc
diff options
context:
space:
mode:
authorakr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-07 14:53:30 +0000
committerakr <akr@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>2008-08-07 14:53:30 +0000
commit1504652373a16c8e7eb5d59894c83572ac72b5e7 (patch)
tree369ab68905e8fd3d61f440a4b56e4781d433efde /enc
parent05373c446902ace063b1ff5caf2236004e7bc763 (diff)
ChangeLog entry (Thu Aug 7 23:43:11 2008, Tanaka Akira <akr@fsij.org>):
* transcode_data.h (rb_transcoding): new field "stateful".
  (rb_transcoder): preprocessor and postprocessor fields removed. change arguments of func_ii, func_si, func_io and func_so. new field "finish_func".
* tool/transcode-tblgen.rb: make FUNii, FUNsi and FUNio generatable.
* transcode.c (transcoder_lib_table): removed.
  (transcoder_table): change structure.
  (transcoder_key): removed because of the above structure change.
  (make_transcoder_entry): new function.
  (get_transcoder_entry): ditto.
  (rb_register_transcoder): follow the structure change.
  (declare_transcoder): ditto.
  (transcode_search_path): new function for breadth first search to find a list of converters.
  (transcode_search_path_i): new function.
  (transcode_dispatch_cb): ditto.
  (transcode_dispatch): use transcode_search_path.
  (transcode_loop): follow the argument change.
  (str_transcode): preprocessor and postprocessor stuff removed.
* enc/trans/iso2022.erb.c: new file. ISO-2022-JP conversion re-implemented.
* enc/trans/japanese.erb.c: ISO-2022-JP stuff removed.
* enc/trans/utf_16_32.erb.c: follow argument change of FUNso.
[ruby-dev:35798]
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18419 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc')
-rw-r--r--enc/trans/iso2022.erb.c142
-rw-r--r--enc/trans/japanese.erb.c227
-rw-r--r--enc/trans/utf_16_32.erb.c32
3 files changed, 158 insertions, 243 deletions
diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c
new file mode 100644
index 0000000000..c3f6be693c
--- /dev/null
+++ b/enc/trans/iso2022.erb.c
@@ -0,0 +1,142 @@
+#include "transcode_data.h"
+
+<%
+ map = {} # hex byte patterns for the iso2022jp_to_eucjp table, each mapped to the callback kind that handles it
+ map["1b2842"] = :func_so # designate US-ASCII to G0. "ESC ( B"
+ map["1b284a"] = :func_so # designate JIS X 0201 latin to G0. "ESC ( J"
+ map["1b2440"] = :func_so # designate JIS X 0208 1978 to G0. "ESC $ @"
+ map["1b2442"] = :func_so # designate JIS X 0208 1983 to G0. "ESC $ B"
+ map["{00-0d,10-1a,1c-7f}"] = :func_si # every 7-bit byte except 0x0e, 0x0f and 0x1b; fun_si_iso2022jp_to_eucjp decides by state what happens to it
+
+ map_jisx0208_rest = {} # second table; its address is returned by fun_si_iso2022jp_to_eucjp
+ map_jisx0208_rest["{21-7e}"] = :func_so # one more byte in 0x21..0x7e, after which fun_so_iso2022jp_to_eucjp emits the pair
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %>
+<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %>
+/*
+ * func_si callback of the ISO-2022-JP to EUC-JP transcoder; used for the
+ * bytes that the map above routes to :func_si.
+ *
+ * t: transcoding state; only t->stateful[0] is read (set to 0 or 1 by
+ *    fun_so_iso2022jp_to_eucjp when it sees an escape sequence).
+ * s: input bytes; only s[0] is examined.
+ * l: not used here.
+ *
+ * Returns NOMAP while the state byte is 0. Otherwise returns the address
+ * of the table iso2022jp_to_eucjp_jisx0208_rest when s[0] is in
+ * 0x21..0x7e, and INVALID for any other byte.
+ * NOTE(review): NOMAP presumably means "copy the byte unchanged" --
+ * confirm in transcode_data.h.
+ */
+static VALUE
+fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l)
+{
+ if (t->stateful[0] == 0)
+ return (VALUE)NOMAP;
+ else if (0x21 <= s[0] && s[0] <= 0x7e)
+ return (VALUE)&iso2022jp_to_eucjp_jisx0208_rest;
+ else
+ return (VALUE)INVALID;
+}
+/*
+ * func_so callback of the ISO-2022-JP to EUC-JP transcoder.
+ *
+ * t: transcoding state; t->stateful[0] is written here.
+ * s: the matched input bytes, l of them.
+ * o: output buffer; at most two bytes are written.
+ *
+ * If s starts with ESC (0x1b) the sequence only updates the state and
+ * produces no output (returns 0):
+ *   - second byte '(' and last byte 'B' or 'J': state byte becomes 0;
+ *   - any other second byte and last byte '@' or 'B': state byte becomes 1;
+ *   - other last bytes leave the state unchanged.
+ * Otherwise s[0] and s[1] are copied to o with the high bit set and 2 is
+ * returned.
+ * NOTE(review): the return value looks like the number of bytes stored in
+ * o -- confirm against the caller in transcode.c.
+ */
+static int
+fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
+{
+ if (s[0] == 0x1b) {
+ if (s[1] == '(') {
+ switch (s[l-1]) {
+ case 'B':
+ case 'J':
+ t->stateful[0] = 0; /* state 0: fun_si_iso2022jp_to_eucjp returns NOMAP */
+ break;
+ }
+ }
+ else {
+ switch (s[l-1]) {
+ case '@':
+ case 'B':
+ t->stateful[0] = 1; /* state 1: bytes are read as two-byte pairs */
+ break;
+ }
+ }
+ return 0;
+ }
+ else {
+ o[0] = s[0] | 0x80;
+ o[1] = s[1] | 0x80;
+ return 2;
+ }
+}
+/*
+ * Transcoder descriptor for ISO-2022-JP to EUC-JP, built on the generated
+ * table iso2022jp_to_eucjp with fun_si_iso2022jp_to_eucjp and
+ * fun_so_iso2022jp_to_eucjp as callbacks.
+ * NOTE(review): the four callback slots are presumably func_ii, func_si,
+ * func_io and func_so in that order, and the numbers 3 and 0 are other
+ * rb_transcoder fields not shown here -- confirm in transcode_data.h.
+ */
+static const rb_transcoder
+rb_ISO_2022_JP_to_EUC_JP = {
+ "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0,
+ NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
+};
+
+<%
+ map_eucjp = { # hex byte patterns for the eucjp_to_iso2022jp table
+ "{0e,0f,1b}" => :undef, # 0x0e, 0x0f and ESC are marked :undef (presumably "no mapping"; see tool/transcode-tblgen.rb)
+ "{00-0d,10-1a,1c-7f}" => :func_so, # remaining 7-bit bytes: one-byte input to fun_so_eucjp_to_iso2022jp
+ "{a1-fe}{a1-fe}" => :func_so, # two high-bit bytes: two-byte input to fun_so_eucjp_to_iso2022jp
+ "8e{a1-fe}" => :undef, # 0x8e-prefixed sequences are not converted by this table
+ "8f{a1-fe}{a1-fe}" => :undef, # 0x8f-prefixed sequences are not converted by this table
+ }
+%>
+
+<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %>
+/*
+ * func_so callback of the EUC-JP to ISO-2022-JP transcoder.
+ *
+ * t: transcoding state. t->stateful[0] is an "initialized" flag and
+ *    t->stateful[1] holds the current output mode: 1 after "ESC ( B",
+ *    2 after "ESC $ B".
+ * s: the matched input character, l bytes long (the map above routes
+ *    only one-byte and two-byte patterns to :func_so).
+ * o: output buffer; up to five bytes are written (a three-byte escape
+ *    sequence followed by at most two character bytes).
+ *
+ * The mode number equals the character length it is used for, so a
+ * mismatch between l and the stored mode means an escape sequence must be
+ * written before the character. The character bytes are then written with
+ * the high bit cleared.
+ *
+ * Returns the number of bytes stored in o.
+ */
+static int
+fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o)
+{
+ unsigned char *output0 = o;
+
+ if (t->stateful[0] == 0) {
+ t->stateful[0] = 1; /* initialized flag */
+ t->stateful[1] = 1; /* ASCII mode */
+ }
+
+ if (l != t->stateful[1]) {
+ if (l == 1) {
+ *o++ = 0x1b;
+ *o++ = '(';
+ *o++ = 'B';
+ t->stateful[1] = 1;
+ }
+ else {
+ *o++ = 0x1b;
+ *o++ = '$';
+ *o++ = 'B';
+ t->stateful[1] = 2;
+ }
+ }
+
+ if (l == 1) {
+ *o++ = s[0] & 0x7f;
+ }
+ else {
+ *o++ = s[0] & 0x7f;
+ *o++ = s[1] & 0x7f;
+ }
+
+ return o - output0;
+}
+/*
+ * Finish callback of the EUC-JP to ISO-2022-JP transcoder.
+ *
+ * t: transcoding state shared with fun_so_eucjp_to_iso2022jp.
+ * o: output buffer; at most three bytes are written.
+ *
+ * If the state was never initialized (fun_so_eucjp_to_iso2022jp did not
+ * run) nothing is written. Otherwise, when the current mode is not 1,
+ * "ESC ( B" is written and the mode is reset to 1.
+ *
+ * Returns the number of bytes stored in o (0 or 3).
+ */
+static int
+finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
+{
+ unsigned char *output0 = o;
+
+ if (t->stateful[0] == 0)
+ return 0;
+
+ if (t->stateful[1] != 1) {
+ *o++ = 0x1b;
+ *o++ = '(';
+ *o++ = 'B';
+ t->stateful[1] = 1;
+ }
+
+ return o - output0;
+}
+/*
+ * Transcoder descriptor for EUC-JP to ISO-2022-JP, built on the generated
+ * table eucjp_to_iso2022jp with fun_so_eucjp_to_iso2022jp as the only
+ * per-character callback and finish_eucjp_to_iso2022jp as the last
+ * initializer.
+ * NOTE(review): 5 matches the largest output of
+ * fun_so_eucjp_to_iso2022jp (3 escape bytes + 2 character bytes), so it is
+ * presumably the maximum output size; the slot order is presumably
+ * func_ii, func_si, func_io, func_so, finish_func -- confirm in
+ * transcode_data.h.
+ */
+static const rb_transcoder
+rb_EUC_JP_to_ISO_2022_JP = {
+ "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0,
+ NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
+};
+/*
+ * Initialization entry point of this file: passes both transcoder
+ * descriptors defined above (ISO-2022-JP to EUC-JP and EUC-JP to
+ * ISO-2022-JP) to rb_register_transcoder.
+ */
+void
+Init_iso2022(void)
+{
+ rb_register_transcoder(&rb_ISO_2022_JP_to_EUC_JP);
+ rb_register_transcoder(&rb_EUC_JP_to_ISO_2022_JP);
+}
+}
+
diff --git a/enc/trans/japanese.erb.c b/enc/trans/japanese.erb.c
index dce9ab5932..dae3bf1e03 100644
--- a/enc/trans/japanese.erb.c
+++ b/enc/trans/japanese.erb.c
@@ -17,235 +17,8 @@
<%= transcode_tblgen "UTF-8", "EUC-JP", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>
<%= transcode_tblgen "UTF-8", "CP51932", [["{00-7f}", :nomap], *UCS_TO_EUCJP_TBL] %>
-#define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte) /* packs an escape-sequence prefix and its following byte into one int */
-enum ISO_2022_ESCSEQ { /* byte, or packed byte pair, that follows ESC; names presumably follow ISO/IEC 2022 -- confirm */
- ISO_2022_CZD = '!',
- ISO_2022_C1D = '"',
- ISO_2022_GZD4 = '(',
- ISO_2022_G1D4 = ')',
- ISO_2022_G2D4 = '*',
- ISO_2022_G3D4 = '+',
- ISO_2022_G1D6 = '-',
- ISO_2022_G2D6 = '.',
- ISO_2022_G3D6 = '/',
- ISO_2022_GZDM4 = ISO_2022_ENCODING('$','('),
- ISO_2022_G1DM4 = ISO_2022_ENCODING('$',')'),
- ISO_2022_G2DM4 = ISO_2022_ENCODING('$','*'),
- ISO_2022_G3DM4 = ISO_2022_ENCODING('$','+'),
- ISO_2022_G1DM6 = ISO_2022_ENCODING('$','-'),
- ISO_2022_G2DM6 = ISO_2022_ENCODING('$','.'),
- ISO_2022_G3DM6 = ISO_2022_ENCODING('$','/'),
- ISO_2022_DOCS = ISO_2022_ENCODING('%','I'),
- ISO_2022_IRR = '&'
-};
-
-
-#define ISO_2022_GZ_ASCII ISO_2022_ENCODING(ISO_2022_GZD4, 'B') /* mode values: prefix packed with the final byte; GZD4 ones fit in 16 bits */
-#define ISO_2022_GZ_JIS_X_0201_Katakana ISO_2022_ENCODING(ISO_2022_GZD4, 'I')
-#define ISO_2022_GZ_JIS_X_0201_Roman ISO_2022_ENCODING(ISO_2022_GZD4, 'J')
-#define ISO_2022_GZ_JIS_C_6226_1978 ISO_2022_ENCODING(ISO_2022_GZDM4,'@') /* GZDM4 ones need more than 16 bits */
-#define ISO_2022_GZ_JIS_X_0208_1983 ISO_2022_ENCODING(ISO_2022_GZDM4,'B')
-#define ISO_2022_GZ_JIS_X_0212_1990 ISO_2022_ENCODING(ISO_2022_GZDM4,'D')
-#define ISO_2022_GZ_JIS_X_0213_2000_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'O')
-#define ISO_2022_GZ_JIS_X_0213_2000_2 ISO_2022_ENCODING(ISO_2022_GZDM4,'P')
-#define ISO_2022_GZ_JIS_X_0213_2004_1 ISO_2022_ENCODING(ISO_2022_GZDM4,'Q')
-
-#define UNSUPPORTED_MODE TRANSCODE_ERROR /* error class passed to rb_raise for unsupported escape sequences */
-/*
- * (Removed by this commit.) Parses the bytes that follow an ESC byte and
- * returns the matching mode value.
- *
- * in_pos: in/out; points at the byte after ESC on entry and is advanced
- *         past the bytes consumed (two or three) on normal return.
- *
- * Accepted forms: "( B", "( I", "( J" (returned as
- * ISO_2022_ENCODING(ISO_2022_GZD4, final)); "$ @", "$ A", "$ B" and
- * "$ ( D|O|P|Q" (all returned as ISO_2022_ENCODING(ISO_2022_GZDM4, final)).
- * Anything else raises UNSUPPORTED_MODE through rb_raise.
- *
- * The function takes no end-of-input pointer, so it reads these bytes
- * without any bounds check.
- */
-static int
-get_iso_2022_mode(const unsigned char **in_pos)
-{
- int new_mode;
- const unsigned char *in_p = *in_pos;
- switch (*in_p++) {
- case '(':
- switch (*in_p++) {
- case 'B': case 'I': case 'J':
- new_mode = ISO_2022_ENCODING(ISO_2022_GZD4, *(in_p-1));
- break;
- default:
- rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC ( %c)", *(in_p-1));
- break;
- }
- break;
- case '$':
- switch (*in_p++) {
- case '@': case 'A': case 'B':
- new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
- break;
- case '(':
- switch (*in_p++) {
- case 'D': case 'O': case 'P': case 'Q':
- new_mode = ISO_2022_ENCODING(ISO_2022_GZDM4, *(in_p-1));
- break;
- default:
- rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ ( %c)", *(in_p-1));
- break;
- }
- break;
- default:
- rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC $ %c)", *(in_p-1));
- break;
- }
- break;
- default:
- rb_raise(UNSUPPORTED_MODE, "this mode is not supported (ESC %c)", *(in_p-1));
- break;
- }
- *in_pos = in_p;
- return new_mode;
-}
-/*
- * (Removed by this commit.) Preprocessor that rewrites ISO-2022-JP input
- * as 8-bit bytes before table-driven conversion.
- *
- * in_pos / in_stop:   input range; *in_pos is advanced to the end.
- * out_pos / out_stop: output range; *out_pos is advanced past the bytes
- *                     written. When fewer than max_output bytes remain,
- *                     the buffer is replaced through
- *                     my_transcoding->flush_func.
- *
- * Starts in ISO_2022_GZ_ASCII mode. ESC switches mode through
- * get_iso_2022_mode; 0x1E / 0x1F and bytes >= 0x80 raise. Other bytes are
- * emitted according to the current mode: unchanged (ASCII, JIS X 0201
- * Roman), as 0x8E + byte|0x80 (JIS X 0201 Katakana), as 0x8F + two
- * high-bit bytes (JIS X 0212), or as two high-bit bytes (JIS C 6226 /
- * JIS X 0208). Modes with no case in the switch produce no output.
- *
- * The second byte of a two-byte character is read without checking it
- * against in_stop.
- */
-static void
-from_iso_2022_jp_transcoder_preprocessor(const unsigned char **in_pos, unsigned char **out_pos,
- const unsigned char *in_stop, unsigned char *out_stop,
- rb_transcoding *my_transcoding)
-{
- const rb_transcoder *my_transcoder = my_transcoding->transcoder;
- const unsigned char *in_p = *in_pos;
- unsigned char *out_p = *out_pos;
- int cur_mode = ISO_2022_GZ_ASCII;
- unsigned char c1;
- unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
- while (in_p < in_stop) {
- if (out_p >= out_s) {
- int len = (out_p - *out_pos);
- int new_len = (len + my_transcoder->max_output) * 2;
- *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
- out_p = *out_pos + len;
- out_s = *out_pos + new_len - my_transcoder->max_output;
- }
- c1 = *in_p++;
- if (c1 == 0x1B) {
- cur_mode = get_iso_2022_mode(&in_p);
- }
- else if (c1 == 0x1E || c1 == 0x1F) {
- /* SHIFT */
- rb_raise(UNSUPPORTED_MODE, "shift is not supported");
- }
- else if (c1 >= 0x80) {
- rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
- }
- else {
- switch (cur_mode) {
- case ISO_2022_GZ_ASCII:
- case ISO_2022_GZ_JIS_X_0201_Roman:
- *out_p++ = c1;
- break;
- case ISO_2022_GZ_JIS_X_0201_Katakana:
- *out_p++ = 0x8E;
- *out_p++ = c1 | 0x80;
- break;
- case ISO_2022_GZ_JIS_X_0212_1990:
- *out_p++ = 0x8F; /* no break: falls through to emit the two character bytes */
- case ISO_2022_GZ_JIS_C_6226_1978:
- case ISO_2022_GZ_JIS_X_0208_1983:
- *out_p++ = c1 | 0x80;
- *out_p++ = *in_p++ | 0x80;
- break;
- }
- }
- }
- /* cleanup */
- *in_pos = in_p;
- *out_pos = out_p;
-}
-/*
- * (Removed by this commit.) Writes the escape sequence that selects
- * new_mode and returns new_mode.
- *
- * out_pos:  in/out; output cursor, advanced past the bytes written.
- * new_mode: a mode value built with ISO_2022_ENCODING.
- *
- * Always writes ESC first. For an ISO_2022_GZD4 mode it then writes the
- * prefix byte and the final byte. For an ISO_2022_GZDM4 mode it writes
- * the top byte (new_mode >> 16), then the middle byte unless the final
- * byte is '@', 'A' or 'B', then the final byte. Any other prefix raises
- * UNSUPPORTED_MODE through rb_raise.
- */
-static int
-select_iso_2022_mode(unsigned char **out_pos, int new_mode)
-{
- unsigned char *out_p = *out_pos;
- *out_p++ = '\x1b';
- switch (new_mode>>8) {
- case ISO_2022_GZD4:
- *out_p++ = new_mode >> 8;
- *out_p++ = new_mode & 0x7F;
- break;
- case ISO_2022_GZDM4:
- *out_p++ = new_mode >> 16;
- if ((new_mode & 0x7F) != '@' &&
- (new_mode & 0x7F) != 'A' &&
- (new_mode & 0x7F) != 'B')
- {
- *out_p++ = (new_mode>>8) & 0x7F;
- }
- *out_p++ = new_mode & 0x7F;
- break;
- default:
- rb_raise(UNSUPPORTED_MODE, "this mode is not supported.");
- break;
- }
- *out_pos = out_p;
- return new_mode;
-}
-/*
- * (Removed by this commit.) Postprocessor that rewrites 8-bit converter
- * output as ISO-2022-JP.
- *
- * in_pos / in_stop:   input range; *in_pos is advanced to the end.
- * out_pos / out_stop: output range; *out_pos is advanced past the bytes
- *                     written. When fewer than max_output bytes remain,
- *                     the buffer is replaced through
- *                     my_transcoding->flush_func.
- *
- * The lead byte selects the mode: < 0x80 is ASCII, 0x8E is JIS X 0201
- * Katakana, 0x8F is JIS X 0212, anything else is JIS X 0208. For 0x8E and
- * 0x8F the lead byte is dropped and the next byte is taken instead. When
- * the mode changes, select_iso_2022_mode writes the escape sequence. Each
- * character is then written with the high bit cleared, as one or two
- * bytes. If the final mode is not ASCII, a switch back to ASCII is
- * appended.
- *
- * Bytes after the lead byte are read without checking them against
- * in_stop.
- */
-static void
-to_iso_2022_jp_transcoder_postprocessor(const unsigned char **in_pos, unsigned char **out_pos,
- const unsigned char *in_stop, unsigned char *out_stop,
- rb_transcoding *my_transcoding)
-{
- const rb_transcoder *my_transcoder = my_transcoding->transcoder;
- const unsigned char *in_p = *in_pos;
- unsigned char *out_p = *out_pos;
- int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0;
- unsigned char next_byte;
- unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
- while (in_p < in_stop) {
- if (out_p >= out_s) {
- int len = (out_p - *out_pos);
- int new_len = (len + my_transcoder->max_output) * 2;
- *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len);
- out_p = *out_pos + len;
- out_s = *out_pos + new_len - my_transcoder->max_output;
- }
- next_byte = *in_p++;
- if (next_byte < 0x80) {
- new_mode = ISO_2022_GZ_ASCII;
- }
- else if (next_byte == 0x8E) {
- new_mode = ISO_2022_GZ_JIS_X_0201_Katakana;
- next_byte = *in_p++;
- }
- else if (next_byte == 0x8F) {
- new_mode = ISO_2022_GZ_JIS_X_0212_1990;
- next_byte = *in_p++;
- }
- else {
- new_mode = ISO_2022_GZ_JIS_X_0208_1983;
- }
- if (cur_mode != new_mode)
- cur_mode = select_iso_2022_mode(&out_p, new_mode);
- if (cur_mode < 0xFFFF) { /* ISO_2022_GZD4 modes fit in 16 bits: one byte per character */
- *out_p++ = next_byte & 0x7F;
- }
- else {
- *out_p++ = next_byte & 0x7F;
- *out_p++ = *in_p++ & 0x7F;
- }
- }
- if (cur_mode != ISO_2022_GZ_ASCII)
- cur_mode = select_iso_2022_mode(&out_p, ISO_2022_GZ_ASCII);
- /* cleanup */
- *in_pos = in_p;
- *out_pos = out_p;
-}
-/*
- * (Removed by this commit.) Transcoder descriptor for ISO-2022-JP to
- * UTF-8: reuses the from_EUC_JP table and installs
- * from_iso_2022_jp_transcoder_preprocessor.
- * NOTE(review): the last two slots are presumably the preprocessor and
- * postprocessor fields that this commit removes -- confirm in the old
- * transcode_data.h.
- */
-static const rb_transcoder
-rb_from_ISO_2022_JP = {
- "ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0,
- &from_iso_2022_jp_transcoder_preprocessor, NULL,
-};
-/*
- * (Removed by this commit.) Transcoder descriptor for UTF-8 to
- * ISO-2022-JP: reuses the to_EUC_JP table and installs
- * to_iso_2022_jp_transcoder_postprocessor.
- * NOTE(review): the last two slots are presumably the preprocessor and
- * postprocessor fields that this commit removes -- confirm in the old
- * transcode_data.h.
- */
-static const rb_transcoder
-rb_to_ISO_2022_JP = {
- "UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1,
- NULL, &to_iso_2022_jp_transcoder_postprocessor,
-};
-
void
Init_japanese(void)
{
<%= transcode_register_code %>
- rb_register_transcoder(&rb_from_ISO_2022_JP);
- rb_register_transcoder(&rb_to_ISO_2022_JP);
}
diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c
index 5f5af5294e..67f84e74bf 100644
--- a/enc/trans/utf_16_32.erb.c
+++ b/enc/trans/utf_16_32.erb.c
@@ -1,7 +1,7 @@
#include "transcode_data.h"
static int
-fun_so_from_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
if (!s[0] && s[1]<0x80) {
o[0] = s[1];
@@ -29,7 +29,7 @@ fun_so_from_utf_16be(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_to_utf_16be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
if (!(s[0]&0x80)) {
o[0] = 0x00;
@@ -57,7 +57,7 @@ fun_so_to_utf_16be(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_from_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
if (!s[1] && s[0]<0x80) {
o[0] = s[0];
@@ -85,7 +85,7 @@ fun_so_from_utf_16le(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_to_utf_16le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_16le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
if (!(s[0]&0x80)) {
o[1] = 0x00;
@@ -113,7 +113,7 @@ fun_so_to_utf_16le(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_from_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
if (!s[1]) {
if (s[2]==0 && s[3]<0x80) {
@@ -142,7 +142,7 @@ fun_so_from_utf_32be(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_to_utf_32be(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32be(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
o[0] = 0;
if (!(s[0]&0x80)) {
@@ -168,13 +168,13 @@ fun_so_to_utf_32be(const unsigned char* s, unsigned char* o)
}
static int
-fun_so_from_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_from_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
return 1;
}
static int
-fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
+fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
return 4;
}
@@ -191,7 +191,7 @@ fun_so_to_utf_32le(const unsigned char* s, unsigned char* o)
static const rb_transcoder
rb_from_UTF_16BE = {
"UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0,
- NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16be
+ NULL, NULL, NULL, &fun_so_from_utf_16be
};
<%=
@@ -217,7 +217,7 @@ rb_from_UTF_16BE = {
static const rb_transcoder
rb_to_UTF_16BE = {
"UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1,
- NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16be
+ NULL, NULL, NULL, &fun_so_to_utf_16be
};
<%=
@@ -232,13 +232,13 @@ rb_to_UTF_16BE = {
static const rb_transcoder
rb_from_UTF_16LE = {
"UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0,
- NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_16le
+ NULL, NULL, NULL, &fun_so_from_utf_16le
};
static const rb_transcoder
rb_to_UTF_16LE = {
"UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1,
- NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_16le
+ NULL, NULL, NULL, &fun_so_to_utf_16le
};
<%=
@@ -254,13 +254,13 @@ rb_to_UTF_16LE = {
static const rb_transcoder
rb_from_UTF_32BE = {
"UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0,
- NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32be
+ NULL, NULL, NULL, &fun_so_from_utf_32be
};
static const rb_transcoder
rb_to_UTF_32BE = {
"UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1,
- NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32be
+ NULL, NULL, NULL, &fun_so_to_utf_32be
};
<%=
@@ -276,13 +276,13 @@ rb_to_UTF_32BE = {
static const rb_transcoder
rb_from_UTF_32LE = {
"UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0,
- NULL, NULL, NULL, NULL, NULL, &fun_so_from_utf_32le
+ NULL, NULL, NULL, &fun_so_from_utf_32le
};
static const rb_transcoder
rb_to_UTF_32LE = {
"UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1,
- NULL, NULL, NULL, NULL, NULL, &fun_so_to_utf_32le
+ NULL, NULL, NULL, &fun_so_to_utf_32le
};
void