#include "ruby.h"

/*
 * Encoding identifiers.  Init_nkf() exports each of these as a constant of
 * the NKF module, and rb_nkf_guess() returns them.
 */
#define _AUTO    0
#define _JIS     1
#define _EUC     2
#define _SJIS    3
#define _BINARY  4
#define _NOCONV  4
#define _UNKNOWN _AUTO

/*
 * NOTE(review): in the copy this block was restored from, the text between
 * "getc(f) (input_ctr" and ">= o_len) {" had been stripped (it looked like
 * markup to whatever tool processed the file).  Everything from here down to
 * the matching "end of reconstructed span" marker was rebuilt from the
 * identifiers that the surviving code uses (input, i_len, input_ctr, output,
 * o_len, output_ctr, dst, incsize, INCSIZE).  The value of INCSIZE and the
 * name of the putchar hook are NOT recoverable from the surviving text --
 * confirm both against a pristine copy before relying on them.
 */

/* Make the included nkf.c read from the in-memory input buffer. */
#undef getc
#undef ungetc
#define getc(f)     (input_ctr<i_len?input[input_ctr++]:-1)
#define ungetc(c,f) input_ctr--

/* Make the included nkf.c write into the destination Ruby string. */
#undef putchar
#undef TRUE
#undef FALSE
#define putchar(c)  rb_nkf_putchar(c)

/* Initial growth step of the output buffer; doubled on every growth. */
#define INCSIZE 32
static int incsize;

/* Conversion state shared between rb_nkf_kconv() and the I/O hooks. */
static unsigned char *input, *output;
static int input_ctr, i_len;
static int output_ctr, o_len;

static VALUE dst;

/*
 * Append one byte to the output buffer, growing the destination string by
 * `incsize` bytes (and doubling `incsize`) whenever the buffer is full.
 * Returns the byte that was written.
 */
static int
rb_nkf_putchar(unsigned int c)
{
    if (output_ctr >= o_len) {
        /* end of reconstructed span: the code below survived intact */
        o_len += incsize;
        rb_str_cat(dst, 0, incsize);
        /* the string may have been reallocated, so re-read its pointer */
        output = (unsigned char *)RSTRING(dst)->ptr;
        incsize *= 2;
    }
    output[output_ctr++] = c;

    return c;
}

#define PERL_XS 1
#include "nkf1.7/nkf.c"

/*
 * NKF.nkf(opt, src) -> String
 *
 * Converts `src` using the option string `opt` and returns the result as a
 * new string.
 *
 *   obj - receiver (unused)
 *   opt - option string; arguments() is invoked at every '-' found in it
 *   src - string to convert
 *
 * reinit(), arguments() and kanji_convert() and the flags iso8859_f,
 * x0201_f, oconv and j_oconv are not defined in this file; they are expected
 * to come from the nkf.c included above.
 */
static VALUE
rb_nkf_kconv(VALUE obj, VALUE opt, VALUE src)
{
    char *opt_ptr, *opt_end;
    /* Written once and never read; presumably keeps a reference to dst on
     * the stack for the garbage collector -- TODO confirm. */
    volatile VALUE v;

    reinit();

    /* Hand every '-'-introduced option in opt to arguments(). */
    StringValue(opt);
    opt_ptr = RSTRING(opt)->ptr;
    opt_end = opt_ptr + RSTRING(opt)->len;
    for (; opt_ptr < opt_end; opt_ptr++) {
        if (*opt_ptr != '-') {
            continue;
        }
        arguments(opt_ptr);
    }

    incsize = INCSIZE;

    /* Point the getc()/ungetc() hooks at the source bytes. */
    input_ctr = 0;
    StringValue(src);
    input = (unsigned char *)RSTRING(src)->ptr;
    i_len = RSTRING(src)->len;

    /* Start with three times the input length plus slack; the putchar hook
     * grows the string if that is not enough. */
    dst = rb_str_new(0, i_len*3 + 10);
    v = dst;

    output_ctr = 0;
    output = (unsigned char *)RSTRING(dst)->ptr;
    o_len = RSTRING(dst)->len;
    *output = '\0';

    if (iso8859_f && (oconv != j_oconv || !x0201_f)) {
        iso8859_f = FALSE;
    }

    kanji_convert(NULL);

    /* Trim the string to the bytes actually produced. */
    RSTRING(dst)->ptr[output_ctr] = '\0';
    RSTRING(dst)->len = output_ctr;
    OBJ_INFECT(dst, src);

    return dst;
}

/*
 * Character code detection - Algorithm described in:
 * Ken Lunde. `Understanding Japanese Information Processing'
 * Sebastopol, CA: O'Reilly & Associates.
*/ static VALUE rb_nkf_guess(obj, src) VALUE obj, src; { unsigned char *p; unsigned char *pend; int sequence_counter = 0; StringValue(src); p = RSTRING(src)->ptr; pend = p + RSTRING(src)->len; if (p == pend) return INT2FIX(_UNKNOWN); #define INCR do {\ p++;\ if (p==pend) return INT2FIX(_UNKNOWN);\ sequence_counter++;\ if (sequence_counter % 2 == 1 && *p != 0xa4)\ sequence_counter = 0;\ if (6 <= sequence_counter) {\ sequence_counter = 0;\ return INT2FIX(_EUC);\ }\ } while (0) if (*p == 0xa4) sequence_counter = 1; while (p= 0x40) { if (*p >= 0x81) { if (*p <= 0x8d || (0x8f <= *p && *p <= 0x9f)) { return INT2FIX(_SJIS); } else if (0xfd <= *p && *p <= 0xfe) { return INT2FIX(_EUC); } } INCR; } } else if (*p <= 0x9f) { return INT2FIX(_SJIS); } } else if (0xf0 <= *p && *p <= 0xfe) { return INT2FIX(_EUC); } else if (0xe0 <= *p && *p <= 0xef) { INCR; if ((0x40 <= *p && *p <= 0x7e) || (0x80 <= *p && *p <= 0xa0)) { return INT2FIX(_SJIS); } if (0xfd <= *p && *p <= 0xfe) { return INT2FIX(_EUC); } } INCR; } return INT2FIX(_UNKNOWN); } void Init_nkf() { VALUE mKconv = rb_define_module("NKF"); rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); }