diff options
Diffstat (limited to 'ext/nkf/nkf.c')
-rw-r--r-- | ext/nkf/nkf.c | 207 |
1 files changed, 207 insertions, 0 deletions
diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c new file mode 100644 index 0000000000..6b5db0fe70 --- /dev/null +++ b/ext/nkf/nkf.c @@ -0,0 +1,207 @@ +#include "ruby.h" + +#define _AUTO 0 +#define _JIS 1 +#define _EUC 2 +#define _SJIS 3 +#define _BINARY 4 +#define _NOCONV 4 +#define _UNKNOWN _AUTO + +#undef getc +#undef ungetc +#define getc(f) (input_ctr>i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- + +#undef putchar +#define putchar(c) rb_nkf_putchar(c) + +#define INCSIZE 32 +static int incsize; + +static unsigned char *input, *output; +static int input_ctr, i_len; +static int output_ctr, o_len; + +static VALUE dst; + +static int +rb_nkf_putchar(c) + unsigned int c; +{ + if (output_ctr >= o_len) { + o_len += incsize; + rb_str_cat(dst, "", incsize); + incsize *= 2; + } + + output[output_ctr++] = c; +/* +printf("[[%c][%c][%d]]\n", c, output[output_ctr - 1], output_ctr); +*/ + return c; +} + +#define PERL_XS 1 +#include "orig/nkf.c" + +static VALUE +rb_nkf_kconv(obj, opt, src) + VALUE obj, opt, src; +{ + int i; + char *opt_ptr, *opt_end; + + reinit(); + opt_ptr = str2cstr(opt, &i); + opt_end = opt_ptr + i; + for (; opt_ptr < opt_end; opt_ptr++) { + if (*opt_ptr != '-') { + continue; + } + arguments(opt_ptr); + } + dst = rb_str_new(0, RSTRING(src)->len*3 + 10); /* large enough? */ + + incsize = INCSIZE; + + input_ctr = 0; + input = str2cstr(src, &i_len); + + output_ctr = 0; + output = RSTRING(dst)->ptr; + o_len = RSTRING(dst)->len; + *output = '\0'; + + if(iso8859_f && (oconv != j_oconv || !x0201_f )) { + iso8859_f = FALSE; + } + + kanji_convert(NULL); + output_ctr--; + if (output[output_ctr] == '\0') { +/* +printf("([%c][%d])\n", output[output_ctr], output_ctr); +*/ + RSTRING(dst)->len = output_ctr; + } else { +/* +printf("<[%c][%d]>\n", output[output_ctr], output_ctr); +*/ + RSTRING(dst)->len = output_ctr + 1; + } + + return dst; +} + +/* + * Character code detection - Algorithm described in: + * Ken Lunde. `Understanding Japanese Information Processing' + * Sebastopol, CA: O'Reilly & Associates. + */ + +static VALUE +rb_nkf_guess(obj, src) + VALUE obj, src; +{ + unsigned char *p; + unsigned char *pend; + int sequence_counter = 0; + + Check_Type(src, T_STRING); + + p = RSTRING(src)->ptr; + pend = p + RSTRING(src)->len; + +#define INCR do {\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ + sequence_counter = 0;\ + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ +} while (0) + + if (*p == 0xa4) + sequence_counter = 1; + + while (p<pend) { + if (*p == '\033') { + return INT2FIX(_JIS); + } + if ('\000' < *p && *p < '\006' + || *p == 0x7f + || *p == 0xdf) { + return INT2FIX(_BINARY); + } + if (0x81 <= *p && *p <= 0x8d) { + return INT2FIX(_SJIS); + } + if (0x8f <= *p && *p <= 0x9f) { + return INT2FIX(_SJIS); + } + if (*p == 0x8e) { /* SS2 */ + INCR; + if ((0x40 <= *p && *p <= 0x7e) || + (0x80 <= *p && *p <= 0xa0) || + (0xe0 <= *p && *p <= 0xfc)) + return INT2FIX(_SJIS); + } + else if (0xa1 <= *p && *p <= 0xdf) { + INCR; + if (0xf0 <= *p && *p <= 0xfe) + return INT2FIX(_EUC); + if (0xe0 <= *p && *p <= 0xef) { + while (p < pend && *p >= 0x40) { + if (*p >= 0x81) { + if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) { + return INT2FIX(_SJIS); + } + else if (0xfd <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + } + INCR; + } + } + else if (*p <= 0x9f) { + return INT2FIX(_SJIS); + } + } + else if (0xf0 <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + else if (0xe0 <= *p && *p <= 0xef) { + INCR; + if ((0x40 <= *p && *p <= 0x7e) || + (0x80 <= *p && *p <= 0xa0)) { + return INT2FIX(_SJIS); + } + if (0xfd <= *p && *p <= 0xfe) { + return INT2FIX(_EUC); + } + } + INCR; + } + return INT2FIX(_UNKNOWN); +} + +void +Init_nkf() +{ + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); + + rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); +} |