From dbe30b383b6d63ea3395b79719d53616beb86e46 Mon Sep 17 00:00:00 2001 From: naruse Date: Wed, 3 Nov 2004 07:30:42 +0000 Subject: * follow nkf 2.0.4 git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_8@7186 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ext/nkf/depend | 2 +- ext/nkf/lib/kconv.rb | 197 +++++- ext/nkf/nkf.c | 191 +++-- ext/nkf/nkf1.7/nkf.c | 1900 -------------------------------------------------- ext/nkf/test.rb | 538 ++++++++++++-- 5 files changed, 780 insertions(+), 2048 deletions(-) delete mode 100644 ext/nkf/nkf1.7/nkf.c diff --git a/ext/nkf/depend b/ext/nkf/depend index 13e32e6074..0ed8fea8d2 100644 --- a/ext/nkf/depend +++ b/ext/nkf/depend @@ -1 +1 @@ -nkf.o : nkf.c $(hdrdir)/ruby.h $(topdir)/config.h $(hdrdir)/defines.h $(srcdir)/nkf1.7/nkf.c +nkf.o : nkf.c $(hdrdir)/ruby.h $(topdir)/config.h $(hdrdir)/defines.h $(srcdir)/nkf-utf8/nkf.c $(srcdir)/nkf-utf8/utf8tbl.c $(srcdir)/nkf-utf8/config.h diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb index af6d82275f..1fd28a5a59 100644 --- a/ext/nkf/lib/kconv.rb +++ b/ext/nkf/lib/kconv.rb @@ -1,73 +1,226 @@ require 'nkf' module Kconv - AUTO = NKF::AUTO - JIS = NKF::JIS - EUC = NKF::EUC - SJIS = NKF::SJIS - BINARY = NKF::BINARY - NOCONV = NKF::NOCONV - UNKNOWN = NKF::UNKNOWN + #Constant of Encoding + AUTO = ::NKF::AUTO + JIS = ::NKF::JIS + EUC = ::NKF::EUC + SJIS = ::NKF::SJIS + BINARY = ::NKF::BINARY + NOCONV = ::NKF::NOCONV + ASCII = ::NKF::ASCII + UTF8 = ::NKF::UTF8 + UTF16 = ::NKF::UTF16 + UTF32 = ::NKF::UTF32 + UNKNOWN = ::NKF::UNKNOWN + + #Regexp of Encoding + Iconv_Shift_JIS = /\A(?: + [\x00-\x7f\xa1-\xdf] | + \x81[\x40-\x7e\x80-\xac\xb8-\xbf\xc8-\xce\xda-\xe8\xf0-\xf7\xfc] | + \x82[\x4f-\x58\x60-\x79\x81-\x9a\x9f-\xf1] | + \x83[\x40-\x7e\x80-\x96\x9f-\xb6\xbf-\xd6\x40-\x60] | + \x84[\x40-\x60\x70-\x7e\x80-\x91\x9f-\xbe\x9f-\xfc] | + [\x89-\x8f\x90-\x97\x99-\x9f\xe0-\xea][\x40-\x7e] | + [\x89-\x97\x99-\x9f\xe0-\xe9][\x80-\xfc] | + \x98[\x40-\x72\x9f-\xfc] | + \xea[\x80-\xa4] + )*\z/nx + Iconv_EUC_JP = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1\xb0-\xbce\xd0-\xf3][\xa1-\xfe] | + \xa2[\xa1-\xae\xba-\xc1\xca-\xd0\xdc-\xea\xf2-\xf9\xfe] | + \xa3[\xb0-\xb9\xc1-\xda\xe1-\xfa] | + \xa4[\xa1-\xf3] | + \xa5[\xa1-\xf6] | + \xa6[\xa1-\xb8\xc1-\xd8] | + \xa7[\xa1-\xc1\xd1-\xf1] | + \xa8[\xa1-\xc0] | + \xcf[\xa1-\xd3] | + \xf4[\xa1-\xa6] + )*\z/nx + Iconv_UTF8 = /\A(?:\xef\xbb\xbf)?(?: + [\x00-\x7f] | + \xc2[\x80-\x8d\x90-\x9f\xa1\xaa\xac\xae-\xb1\xb4\xb6\xb8\xba\xbf] | + \xc3[\x80-\xbf] | + \xc4[\x80-\x93\x96-\xa2\xa4-\xab\xae-\xbf] | + \xc5[\x80-\x8d\x90-\xbe] | + \xc7[\x8d-\x9c\xb5] | + \xcb[\x87\x98-\x9b\x9d] | + \xce[\x84-\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf] | + \xcf[\x80-\x8e] | + \xd0[\x81-\x8c\x8e-\xbf] | + \xd1[\x80-\x8f\x91-\x9f] | + \xe2\x84[\x83\x96\xa2\xab] | + \xe2\x86[\x83\x91-\x93\x96\xa2\xab] | + \xe2\x87[\x83\x91-\x94\x96\xa2\xab] | + \xe2\x88[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0\xa2\xa7-\xac\xb4-\xb5\xbd] | + \xe2\x89[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa6-\xac\xb4-\xb5\xbd] | + \xe2[\x8a\x8c][\x82-\x83\x86-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa5-\xac\xb4-\xb5\xbd] | + \xe2[\x94-\x99][\x81-\x83\x86-\x88\x8b-\x8c\x8f-\x94\x96-\x98\x9a-\x9e\xa0-\xac\xaf-\xb0\xb3-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3\x80[\x81-\x83\x85-\x98\x9a-\x9e\xa0-\xad\xaf-\xb0\xb2-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3[\x81-\x83\xb8-\xbf][\x81-\xbf] | + [\xe5-\xe7][\x80-\xbf][\x81-\xbf] | + \xe8[\x80-\xae\xb0-\xbf][\x81-\xbf] | + \xe9[\x80-\x92\x95-\xb1\xb3-\xbe][\x81-\xbf] | + \xef[\xbc-\xbe][\x81-\xbf] | + )*\z/nx + RegexpShiftjis = /\A(?: + [\x00-\x7f\xa1-\xdf] | + [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc] + )*\z/nx + RegexpEucjp = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1-\xdf] [\xa1-\xdf] + )*\z/nx + RegexpUtf8 = /\A(?: + [\x00-\x7f] | + [\xc2-\xdf] [\x80-\xbf] | + \xe0 [\xa0-\xbf] [\x80-\xbf] | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] + )*\z/nx + + # + # kconv + # + def kconv(str, out_code, in_code = AUTO) opt = '-' case in_code - when NKF::JIS + when ::NKF::JIS opt << 'J' - when NKF::EUC + when ::NKF::EUC opt << 'E' - when NKF::SJIS + when ::NKF::SJIS opt << 'S' + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'W' end case out_code - when NKF::JIS + when ::NKF::JIS opt << 'j' - when NKF::EUC + when ::NKF::EUC opt << 'e' - when NKF::SJIS + when ::NKF::SJIS opt << 's' - when NKF::NOCONV + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'w' + when ::NKF::NOCONV return str end opt = '' if opt == '-' - NKF::nkf(opt, str) + ::NKF::nkf(opt, str) end module_function :kconv + # + # Encode to + # + def tojis(str) - NKF::nkf('-j', str) + ::NKF::nkf('-j', str) end module_function :tojis def toeuc(str) - NKF::nkf('-e', str) + ::NKF::nkf('-e', str) end module_function :toeuc def tosjis(str) - NKF::nkf('-s', str) + ::NKF::nkf('-s', str) end module_function :tosjis + def toutf8(str) + ::NKF::nkf('-w', str) + end + module_function :toutf8 + + def toutf16(str) + ::NKF::nkf('-w16', str) + end + module_function :toutf16 + + # + # guess + # + def guess(str) - NKF::guess(str) + ::NKF::guess(str) end module_function :guess + + def guess_old(str) + ::NKF::guess_old(str) + end + module_function :guess_old + + # + # isEncoding + # + + def iseuc(str) + RegexpEucjp.match( str ) + end + module_function :iseuc + + def issjis(str) + RegexpShiftjis.match( str ) + end + module_function :issjis + + def isutf8(str) + RegexpUtf8.match( str ) + end + module_function :isutf8 + end class String def kconv(out_code, in_code=Kconv::AUTO) Kconv::kconv(self, out_code, in_code) end + + # to Encoding def tojis - NKF::nkf('-j', self) + ::NKF::nkf('-j', self) end def toeuc - NKF::nkf('-e', self) + ::NKF::nkf('-e', self) end def tosjis - NKF::nkf('-s', self) + ::NKF::nkf('-s', self) + end + def toutf8 + ::NKF::nkf('-w', self) + end + def toutf16 + ::NKF::nkf('-w16', self) + end + + # is Encoding + def iseuc + Kconv.iseuc( self ) + end + + def issjis + Kconv.issjis( self ) + end + + def isutf8 + Kconv.isutf8( self ) end end diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c index ca6de73e10..6517b3aba1 100644 --- a/ext/nkf/nkf.c +++ b/ext/nkf/nkf.c @@ -1,51 +1,82 @@ +/* + * NKF Module for Ruby base on nkf 2.x + * + * original nkf2.0 is maintained at http://sourceforge.jp/projects/nkf/ + * + */ + +static char *RVersion = "2.0.4.1r1"; + #include "ruby.h" +/* Encoding Constants */ #define _AUTO 0 #define _JIS 1 #define _EUC 2 #define _SJIS 3 #define _BINARY 4 #define _NOCONV 4 +#define _ASCII 5 +/* 0b011x is reserved for UTF-8 Family */ +#define _UTF8 6 +/* 0b10xx is reserved for UTF-16 Family */ +#define _UTF16 8 +/* 0b11xx is reserved for UTF-32 Family */ +#define _UTF32 12 +#define _OTHER 16 #define _UNKNOWN _AUTO +/* Replace nkf's getchar/putchar for variable modification */ +/* we never use getc, ungetc */ + #undef getc #undef ungetc -#define getc(f) (input_ctr=i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- +#define INCSIZE 32 #undef putchar -#define putchar(c) rb_nkf_putchar(c) +#undef TRUE +#undef FALSE +#define putchar(c) rb_nkf_putchar(c) -#define INCSIZE 32 -static int incsize; +/* Input/Output pointers */ -static unsigned char *input, *output; -static int input_ctr, i_len; -static int output_ctr, o_len; +static unsigned char *output; +static unsigned char *input; +static int input_ctr; +static int i_len; +static int output_ctr; +static int o_len; +static int incsize; -static VALUE dst; +static VALUE result; static int rb_nkf_putchar(c) - unsigned int c; + unsigned int c; { if (output_ctr >= o_len) { o_len += incsize; - rb_str_resize(dst, o_len); - output = RSTRING(dst)->ptr; + rb_str_resize(result, o_len); incsize *= 2; + output = RSTRING(result)->ptr; } output[output_ctr++] = c; return c; } +/* Include kanji filter main part */ +/* getchar and putchar will be replaced during inclusion */ + #define PERL_XS 1 -#include "nkf1.7/nkf.c" +#include "nkf-utf8/utf8tbl.c" +#include "nkf-utf8/nkf.c" static VALUE rb_nkf_kconv(obj, opt, src) - VALUE obj, opt, src; + VALUE obj, opt, src; { char *opt_ptr, *opt_end; volatile VALUE v; @@ -58,44 +89,46 @@ rb_nkf_kconv(obj, opt, src) if (*opt_ptr != '-') { continue; } - arguments(opt_ptr); + options(opt_ptr); } incsize = INCSIZE; - input_ctr = 0; + input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; - dst = rb_str_new(0, i_len*3 + 10); - v = dst; + result = rb_str_new(0, i_len*3 + 10); + v = result; output_ctr = 0; - output = RSTRING(dst)->ptr; - o_len = RSTRING(dst)->len; + output = RSTRING(result)->ptr; + o_len = RSTRING(result)->len; *output = '\0'; - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - iso8859_f = FALSE; - } + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); kanji_convert(NULL); - RSTRING(dst)->ptr[output_ctr] = '\0'; - RSTRING(dst)->len = output_ctr; - OBJ_INFECT(dst, src); + RSTRING(result)->ptr[output_ctr] = '\0'; + RSTRING(result)->len = output_ctr; + OBJ_INFECT(result, src); - return dst; + return result; } + /* + * NKF.guess1 + * * Character code detection - Algorithm described in: * Ken Lunde. `Understanding Japanese Information Processing' * Sebastopol, CA: O'Reilly & Associates. */ static VALUE -rb_nkf_guess(obj, src) - VALUE obj, src; +rb_nkf_guess1(obj, src) + VALUE obj, src; { unsigned char *p; unsigned char *pend; @@ -107,16 +140,16 @@ rb_nkf_guess(obj, src) if (p == pend) return INT2FIX(_UNKNOWN); #define INCR do {\ - p++;\ - if (p==pend) return INT2FIX(_UNKNOWN);\ - sequence_counter++;\ - if (sequence_counter % 2 == 1 && *p != 0xa4)\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ sequence_counter = 0;\ - if (6 <= sequence_counter) {\ - sequence_counter = 0;\ - return INT2FIX(_EUC);\ - }\ -} while (0) + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ + } while (0) if (*p == 0xa4) sequence_counter = 1; @@ -180,19 +213,77 @@ rb_nkf_guess(obj, src) return INT2FIX(_UNKNOWN); } + +/* + * NKF.guess2 + * + * Guess Encoding By NKF2.0 Routine + */ + +static VALUE +rb_nkf_guess2(obj, src) + VALUE obj, src; +{ + int code = _BINARY; + + reinit(); + + input_ctr = 0; + StringValue(src); + input = RSTRING(src)->ptr; + i_len = RSTRING(src)->len; + + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); + + guess_f = TRUE; + kanji_convert( NULL ); + guess_f = FALSE; + + if (!is_inputcode_mixed) { + if (strcmp(input_codename, "") == 0) { + code = _ASCII; + } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { + code = _JIS; + } else if (strcmp(input_codename, "EUC-JP") == 0) { + code = _EUC; + } else if (strcmp(input_codename, "Shift_JIS") == 0) { + code = _SJIS; + } else if (strcmp(input_codename, "UTF-8") == 0) { + code = _UTF8; + } else if (strcmp(input_codename, "UTF-16") == 0) { + code = _UTF16; + } else if (strlen(input_codename) > 0) { + code = _UNKNOWN; + } + } + + return INT2FIX( code ); +} + + +/* Initialize NKF Module */ + void Init_nkf() { - VALUE mKconv = rb_define_module("NKF"); - - rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); - rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); - - rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); - rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); - rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); - rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); - rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); - rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); - rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1); + + rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII)); + rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8)); + rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16)); + rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + rb_define_const(mKconv, "VERSION", rb_str_new2(RVersion)); } diff --git a/ext/nkf/nkf1.7/nkf.c b/ext/nkf/nkf1.7/nkf.c deleted file mode 100644 index 09419f40a7..0000000000 --- a/ext/nkf/nkf1.7/nkf.c +++ /dev/null @@ -1,1900 +0,0 @@ -/** Network Kanji Filter. (PDS Version) -************************************************************************ -** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA) -** $BO"Mm@h!'(B $B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j(B -** $B!J(BE-Mail Address: ichikawa@flab.fujitsu.co.jp$B!K(B -** Copyright (C) 1996,1998 -** $BO"Mm@h!'(B $BN05eBg3X>pJs9)3X2J(B $B2OLn(B $B??<#(B mine/X0208 support -** $B!J(BE-Mail Address: kono@ie.u-ryukyu.ac.jp$B!K(B -** $BO"Mm@h!'(B COW for DOS & Win16 & Win32 & OS/2 -** $B!J(BE-Mail Address: GHG00637@niftyserve.or.p$B!K(B -** $B$3$N%=!<%9$N$$$+$J$kJ#Z$b$7$J$$!"0-$7$+$i$:!#(B -** Everyone is permitted to do anything on this program -** including copying, modifying, improving. -** as long as you don't try to pretend that you wrote it. -** i.e., the above copyright notice has to appear in all copies. -** You don't have to ask before copying or publishing. -** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE. -***********************************************************************/ - -static char *CopyRight = - "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),1998 S. Kono, COW"; -static char *Version = - "1.7"; -static char *Patchlevel = - "0/9711/Shinji Kono"; - -/* -** -** -** -** USAGE: nkf [flags] [file] -** -** Flags: -** b Output is bufferred (DEFAULT) -** u Output is unbufferred -** -** t no operation -** -** j Outout code is JIS 7 bit (DEFAULT SELECT) -** s Output code is MS Kanji (DEFAULT SELECT) -** e Output code is AT&T JIS (DEFAULT SELECT) -** l Output code is JIS 7bit and ISO8859-1 Latin-1 -** -** m MIME conversion for ISO-2022-JP -** i_ Output sequence to designate JIS-kanji (DEFAULT_J) -** o_ Output sequence to designate single-byte roman characters (DEFAULT_R) -** -** r {de/en}crypt ROT13/47 -** -** v display Version -** -** T Text mode output (for MS-DOS) -** -** x Do not convert X0201 kana into X0208 -** Z Convert X0208 alphabet to ASCII -** -** f60 fold option -** -** m MIME decode -** B try to fix broken JIS, missing Escape -** B[1-9] broken level -** -** O Output to 'nkf.out' file -** d Delete \r in line feed -** c Add \r in line feed -**/ -/******************************/ -/* $B%G%U%)%k%H$N=PNO%3!<%IA*Br(B */ -/* Select DEFAULT_CODE */ -#define DEFAULT_CODE_JIS -/* #define DEFAULT_CODE_SJIS */ -/* #define DEFAULT_CODE_EUC */ -/******************************/ - -#if (defined(__TURBOC__) || defined(LSI_C)) && !defined(MSDOS) -#define MSDOS -#endif - -#ifndef PERL_XS -#include -#endif - -#if defined(MSDOS) || defined(__OS2__) -#include -#include -#include -#endif - -#ifdef MSDOS -#ifdef LSI_C -#define setbinmode(fp) fsetbin(fp) -#else /* Microsoft C, Turbo C */ -#define setbinmode(fp) setmode(fileno(fp), O_BINARY) -#endif -#else /* UNIX,OS/2 */ -#define setbinmode(fp) -#endif - -#ifdef _IOFBF /* SysV and MSDOS */ -#define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size) -#else /* BSD */ -#define setvbuffer(fp, buf, size) setbuffer(fp, buf, size) -#endif - -/*Borland C++ 4.5 EasyWin*/ -#if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */ -#define EASYWIN -#include -#endif - -#define FALSE 0 -#define TRUE 1 - -/* state of output_mode and input_mode */ - -#define ASCII 0 -#define X0208 1 -#define X0201 2 -#define NO_X0201 3 -#define JIS_INPUT 4 -#define SJIS_INPUT 5 -#define LATIN1_INPUT 6 -#define FIXED_MIME 7 -#define DOUBLE_SPACE -2 - -#define NL 0x0a -#define ESC 0x1b -#define SPACE 0x20 -#define AT 0x40 -#define SSP 0xa0 -#define DEL 0x7f -#define SI 0x0f -#define SO 0x0e -#define SSO 0x8e - -#define HOLD_SIZE 32 -#define IOBUF_SIZE 16384 - -#define DEFAULT_J 'B' -#define DEFAULT_R 'B' - -#define SJ0162 0x00e1 /* 01 - 62 ku offset */ -#define SJ6394 0x0161 /* 63 - 94 ku offset */ - - -/* MIME preprocessor */ - -#undef STRICT_MIME /* do stupid strict mime integrity check */ -#define GETC(p) ((!mime_mode)?getc(p):mime_getc(p)) -#define UNGETC(c,p) ((!mime_mode)?ungetc(c,p):mime_ungetc(c)) - - -#ifdef EASYWIN /*Easy Win */ -extern POINT _BufferSize; -#endif - -/* function prototype */ - -#ifndef _ -# ifdef __STDC__ -# define _(args) args -# else -# define _(args) () -# endif -#endif - -#ifndef PERL_XS -static void noconvert _((FILE *f)); -static int mime_integrity _((FILE *f,unsigned char *p)); -static int usage _((void)); -static char stdibuf[IOBUF_SIZE]; -static char stdobuf[IOBUF_SIZE]; -static unsigned int mime_input = 0; /* undecoded */ -static int end_check; -#endif - -static void kanji_convert _((FILE *f)); -static void h_conv _((FILE *f,int c2,int c1)); -static int push_hold_buf _((int c2,int c1)); -static void s_iconv _((int c2,int c1)); -static void e_oconv _((int c2,int c1)); -static void s_oconv _((int c2,int c1)); -static void j_oconv _((int c2,int c1)); -static int line_fold _((int c2,int c1)); -static int pre_convert _((int c1,int c2)); -static int mime_begin _((FILE *f)); -static int mime_getc _((FILE *f)); -static int mime_ungetc _((unsigned int c)); -static int base64decode _((int c)); -static void arguments _((char *c)); -static void reinit _((void)); - -/* buffers */ - -static unsigned char hold_buf[HOLD_SIZE*2]; -static int hold_count; - -/* MIME preprocessor fifo */ - -#define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ -#define MIME_BUF_MASK (MIME_BUF_SIZE-1) -#define Fifo(n) mime_buf[(n)&MIME_BUF_MASK] -static unsigned char mime_buf[MIME_BUF_SIZE]; -static unsigned int mime_top = 0; -static unsigned int mime_last = 0; /* decoded */ - -/* flags */ -static int unbuf_f = FALSE; -static int estab_f = FALSE; -static int nop_f = FALSE; -static int binmode_f = TRUE; /* binary mode */ -static int rot_f = FALSE; /* rot14/43 mode */ -static int input_f = FALSE; /* non fixed input code */ -static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */ -static int mime_f = TRUE; /* convert MIME B base64 or Q */ -static int mimebuf_f = FALSE; /* MIME buffered input */ -static int broken_f = FALSE; /* convert ESC-less broken JIS */ -static int iso8859_f = FALSE; /* ISO8859 through */ -#if defined(MSDOS) || defined(__OS2__) -static int x0201_f = TRUE; /* Assume JISX0201 kana */ -#else -static int x0201_f = NO_X0201; /* Assume NO JISX0201 */ -#endif - -/* X0208 -> ASCII converter */ - -static int c1_return; - -/* fold parameter */ -static int line = 0; /* chars in line */ -static int prev = 0; -static int fold_f = FALSE; -static int fold_len = 0; - -/* options */ -static char kanji_intro = DEFAULT_J, - ascii_intro = DEFAULT_R; - -/* Folding */ - -int line_fold(); -#define FOLD_MARGIN 10 -#define DEFAULT_FOLD 60 - -/* converters */ - -#ifdef DEFAULT_CODE_JIS -# define DEFAULT_CONV j_oconv -#endif -#ifdef DEFAULT_CODE_SJIS -# define DEFAULT_CONV s_oconv -#endif -#ifdef DEFAULT_CODE_EUC -# define DEFAULT_CONV e_oconv -#endif - -static void (*iconv) _((int c2,int c1)); - /* s_iconv or oconv */ -static void (*oconv) _((int c2,int c1)) = DEFAULT_CONV; - /* [ejs]_oconv */ - -/* Global states */ -static int output_mode = ASCII, /* output kanji mode */ - input_mode = ASCII, /* input kanji mode */ - shift_mode = FALSE; /* TRUE shift out, or X0201 */ -static int mime_mode = FALSE; /* MIME mode B base64, Q hex */ - -/* X0201 / X0208 conversion tables */ - -/* X0201 kana conversion table */ -/* 90-9F A0-DF */ -unsigned char cv[]= { -0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57, -0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21, -0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29, -0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43, -0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26, -0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d, -0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35, -0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d, -0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46, -0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c, -0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52, -0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e, -0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62, -0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69, -0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d, -0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c, -0x00,0x00}; - - -/* X0201 kana conversion table for daguten */ -/* 90-9F A0-DF */ -unsigned char dv[]= { -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e, -0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36, -0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e, -0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47, -0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53, -0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00}; - -/* X0201 kana conversion table for han-daguten */ -/* 90-9F A0-DF */ -unsigned char ev[]= { -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54, -0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00}; - - -/* X0208 kigou conversion table */ -/* 0x8140 - 0x819e */ -unsigned char fv[] = { - -0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a, -0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00, -0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f, -0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27, -0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d, -0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00, -0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -} ; - - -static int file_out = FALSE; -static int add_cr = FALSE; -static int del_cr = FALSE; - -#ifndef PERL_XS -int -main(argc, argv) - int argc; - char **argv; -{ - FILE *fin; - char *cp; - -#ifdef EASYWIN /*Easy Win */ - _BufferSize.y = 400;/*Set Scroll Buffer Size*/ -#endif - - for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { - cp = *argv; - arguments(cp); - } - - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - fprintf(stderr,"Mixed ISO8859/JISX0201/SJIS/EUC output is not allowed.\n"); - exit(1); - } - - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","wb",stdout) == NULL) - return (-1); -#else - setbinmode(stdout); -#endif - - if(unbuf_f) - setbuf(stdout, (char *) NULL); - else - setvbuffer(stdout, stdobuf, IOBUF_SIZE); - - if(argc == 0) { - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","rb",stdin) == NULL) return (-1); -#else - setbinmode(stdin); -#endif - setvbuffer(stdin, stdibuf, IOBUF_SIZE); - if(nop_f) - noconvert(stdin); - else - kanji_convert(stdin); - } else { - while (argc--) { - if((fin = fopen(*argv++, "r")) == NULL) { - perror(*--argv); - return(-1); - } else { -/* reopen file for stdout */ - if(file_out == TRUE){ - if(argc == 1 ) { - if(freopen(*argv++, "w", stdout) == NULL) { - perror(*--argv); - return (-1); - } - argc--; - } else { - if(freopen("nkf.out", "w", stdout) == NULL) { - perror(*--argv); - return (-1); - } - } - if(binmode_f == TRUE) { -#ifdef __OS2__ - if(freopen("","wb",stdout) == NULL) - return (-1); -#else - setbinmode(stdout); -#endif - } - } - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","rb",fin) == NULL) - return (-1); -#else - setbinmode(fin); -#endif - setvbuffer(fin, stdibuf, IOBUF_SIZE); - if(nop_f) - noconvert(fin); - else - kanji_convert(fin); - fclose(fin); - } - } - } -#ifdef EASYWIN /*Easy Win */ - if(file_out == FALSE) - scanf("%d",&end_check); - else - fclose(stdout); -#else /* for Other OS */ - if(file_out == TRUE) - fclose(stdout); -#endif - return (0); -} -#endif - -static void -arguments(cp) - char *cp; -{ - while (*cp) { - switch (*cp++) { - case 'b': /* buffered mode */ - unbuf_f = FALSE; - continue; - case 'u': /* non bufferd mode */ - unbuf_f = TRUE; - continue; - case 't': /* transparent mode */ - nop_f = TRUE; - continue; - case 'j': /* JIS output */ - case 'n': - oconv = j_oconv; - continue; - case 'e': /* AT&T EUC output */ - oconv = e_oconv; - continue; - case 's': /* SJIS output */ - oconv = s_oconv; - continue; - case 'l': /* ISO8859 Latin-1 support, no conversion */ - iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ - input_f = LATIN1_INPUT; - continue; - case 'i': /* Kanji IN ESC-$-@/B */ - if(*cp=='@'||*cp=='B') - kanji_intro = *cp++; - continue; - case 'o': /* ASCII IN ESC-(-J/B */ - if(*cp=='J'||*cp=='B'||*cp=='H') - ascii_intro = *cp++; - continue; - case 'r': - rot_f = TRUE; - continue; -#if defined(MSDOS) || defined(__OS2__) - case 'T': - binmode_f = FALSE; - continue; -#endif -#ifndef PERL_XS - case 'v': - usage(); - exit(1); - break; -#endif - /* Input code assumption */ - case 'J': /* JIS input */ - case 'E': /* AT&T EUC input */ - input_f = JIS_INPUT; - continue; - case 'S': /* MS Kanji input */ - input_f = SJIS_INPUT; - if(x0201_f==NO_X0201) x0201_f=TRUE; - continue; - case 'Z': /* Convert X0208 alphabet to asii */ - /* bit:0 Convert X0208 - bit:1 Convert Kankaku to one space - bit:2 Convert Kankaku to two spaces - */ - if('9'>= *cp && *cp>='0') - alpha_f |= 1<<(*cp++ -'0'); - else - alpha_f |= TRUE; - continue; - case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ - x0201_f = FALSE; /* No X0201->X0208 conversion */ - /* accept X0201 - ESC-(-I in JIS, EUC, MS Kanji - SI/SO in JIS, EUC, MS Kanji - SSO in EUC, JIS, not in MS Kanji - MS Kanji (0xa0-0xdf) - output X0201 - ESC-(-I in JIS (0x20-0x5f) - SSO in EUC (0xa0-0xdf) - 0xa0-0xd in MS Kanji (0xa0-0xdf) - */ - continue; - case 'X': /* Assume X0201 kana */ - /* Default value is NO_X0201 for EUC/MS-Kanji mix */ - x0201_f = TRUE; - continue; - case 'f': /* folding -f60 or -f */ - fold_f = TRUE; - fold_len = atoi(cp); - if(!(0= *cp && *cp>='0') - broken_f |= 1<<(*cp++ -'0'); - else - broken_f |= TRUE; - continue; -#ifndef PERL_XS - case 'O':/* for Output file */ - file_out = TRUE; - continue; -#endif - case 'c':/* add cr code */ - add_cr = TRUE; - continue; - case 'd':/* delete cr code */ - del_cr = TRUE; - continue; - default: - /* bogus option but ignored */ - continue; - } - } -} - -#ifndef PERL_XS -static void -noconvert(f) - FILE *f; -{ - int c; - - while ((c = getc(f)) != EOF) - putchar(c); -} -#endif - - -static void -kanji_convert(f) - FILE *f; -{ - int c1, c2; - - c2 = 0; - - if(input_f == JIS_INPUT || input_f == LATIN1_INPUT) { - estab_f = TRUE; iconv = oconv; - } else if(input_f == SJIS_INPUT) { - estab_f = TRUE; iconv = s_iconv; - } else { - estab_f = FALSE; iconv = oconv; - } - input_mode = ASCII; - output_mode = ASCII; - shift_mode = FALSE; - -#define NEXT continue /* no output, get next */ -#define SEND ; /* output c1 and c2, get next */ -#define LAST break /* end of loop, go closing */ - - while ((c1 = GETC(f)) != EOF) { - if(c2) { - /* second byte */ - if(c2 > DEL) { - /* in case of 8th bit is on */ - if(!estab_f) { - /* in case of not established yet */ - if(c1 > SSP) { - /* It is still ambiguious */ - h_conv(f, c2, c1); - c2 = 0; - NEXT; - } else if(c1 < AT) { - /* ignore bogus code */ - c2 = 0; - NEXT; - } else { - /* established */ - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - SEND; - } - } else - /* in case of already established */ - if(c1 < AT) { - /* ignore bogus code */ - c2 = 0; - NEXT; - } else - SEND; - } else - /* 7 bit code */ - /* it might be kanji shitfted */ - if((c1 == DEL) || (c1 <= SPACE)) { - /* ignore bogus first code */ - c2 = 0; - NEXT; - } else - SEND; - } else { - /* first byte */ - if(c1 > DEL) { - /* 8 bit code */ - if(!estab_f && !iso8859_f) { - /* not established yet */ - if(c1 < SSP) { - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - } else if(c1 < 0xe0) { - /* it seems to be EUC */ - estab_f = TRUE; - iconv = oconv; - } else { - /* still ambiguious */ - } - c2 = c1; - NEXT; - } else { /* estab_f==TRUE */ - if(iso8859_f) { - SEND; - } else if(SSP<=c1 && c1<0xe0 && iconv == s_iconv) { - /* SJIS X0201 Case... */ - /* This is too arrogant, but ... */ - if(x0201_f==NO_X0201) { - iconv = oconv; - c2 = c1; - NEXT; - } else - if(x0201_f) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - /* look ahead for X0201/X0208conversion */ - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } else if(c2==(0xde)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } - UNGETC(c2,f); c2 = 0; - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1==SSO && iconv != s_iconv) { - /* EUC X0201 Case */ - /* This is too arrogant - if(x0201_f == NO_X0201) { - estab_f = FALSE; - c2 = 0; - NEXT; - } */ - c1 = GETC(f); /* skip SSO */ - euc_1byte_check: - if(x0201_f && SSP<=c1 && c1<0xe0) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } - /* forward lookup $BByE@(B/$BH>ByE@(B */ - if(c2 != SSO) { - UNGETC(c2,f); c2 = 0; - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - (*oconv)(0,SSO); - LAST; - } else if(c2==(0xde)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - /* we have to check this c2 */ - /* and no way to push back SSO */ - c1 = c2; c2 = 0; - goto euc_1byte_check; - } - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1 < SSP && iconv != s_iconv) { - /* strange code in EUC */ - iconv = s_iconv; /* try SJIS */ - c2 = c1; - NEXT; - } else { - /* already established */ - c2 = c1; - NEXT; - } - } - } else if((c1 > SPACE) && (c1 != DEL)) { - /* in case of Roman characters */ - if(shift_mode) { - c1 |= 0x80; - /* output 1 shifted byte */ - if(x0201_f && (!iso8859_f||input_mode==X0201) && - SSP<=c1 && c1<0xe0 ) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } else if(c2==(0xde&0x7f)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf&0x7f)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } - UNGETC(c2,f); c2 = 0; - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1 == '(' && broken_f && input_mode == X0208 - && !mime_mode ) { - /* Try to recover missing escape */ - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, '('); - LAST; - } else { - if(c1 == 'B' || c1 == 'J' || c1 == 'H') { - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, '('); - /* do not modify various input_mode */ - /* It can be vt100 sequence */ - SEND; - } - } - } else if(input_mode == X0208) { - /* in case of Kanji shifted */ - c2 = c1; - NEXT; - /* goto next_byte */ - } else if(c1 == '=' && mime_f && !mime_mode ) { - if((c1 = getc(f)) == EOF) { - (*oconv)(0, '='); - LAST; - } else if(c1 == '?') { - /* =? is mime conversiooon start sequence */ - if(mime_begin(f) == EOF) /* check in detail */ - LAST; - else - NEXT; - } else { - (*oconv)(0, '='); - ungetc(c1,f); - NEXT; - } - } else if(c1 == '$' && broken_f && !mime_mode) { - /* try to recover missing escape */ - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, '$'); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* in case of Kanji in ESC sequence */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - /* sorry */ - (*oconv)(0, '$'); - (*oconv)(0, c1); - NEXT; - } - } else - SEND; - } else if(c1 == SI) { - shift_mode = FALSE; - NEXT; - } else if(c1 == SO) { - shift_mode = TRUE; - NEXT; - } else if(c1 == ESC) { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - LAST; - } else if(c1 == '$') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else if(c1 == '(') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - (*oconv)(0, c1); - NEXT; - } - } else if(broken_f&0x2) { - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, c1); - NEXT; - } - } else if(c1 == '(') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '('); - LAST; - } else { - if(c1 == 'I') { - /* This is X0201 kana introduction */ - input_mode = X0201; shift_mode = X0201; - NEXT; - } else if(c1 == 'B' || c1 == 'J' || c1 == 'H') { - /* This is X0208 kanji introduction */ - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else if(broken_f&0x2) { - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '('); - /* maintain various input_mode here */ - SEND; - } - } - } else { - /* lonely ESC */ - (*oconv)(0, ESC); - SEND; - } - } else if(c1 == NL && broken_f&4) { - input_mode = ASCII; - SEND; - } else - SEND; - } - /* send: */ - if(input_mode == X0208) - (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ - else - (*iconv)(c2, c1); /* can be EUC/SJIS */ - c2 = 0; - continue; - /* goto next_word */ - } - - /* epilogue */ - (*iconv)(EOF, 0); -} - - - - -static void -h_conv(f, c2, c1) - FILE *f; - int c1, c2; -{ - int wc; - - - /** it must NOT be in the kanji shifte sequence */ - /** it must NOT be written in JIS7 */ - /** and it must be after 2 byte 8bit code */ - - hold_count = 0; - push_hold_buf(c2, c1); - c2 = 0; - - while ((c1 = GETC(f)) != EOF) { - if(c2) { - /* second byte */ - if(!estab_f) { - /* not established */ - if(c1 > SSP) { - /* it is still ambiguious yet */ - SEND; - } else if(c1 < AT) { - /* ignore bogus first byte */ - c2 = 0; - SEND; - } else { - /* now established */ - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - SEND; - } - } else - SEND; - } else { - /* First byte */ - if(c1 > DEL) { - /* 8th bit is on */ - if(c1 < SSP) { - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - } else if(c1 < 0xe0) { - /* it seems to be EUC */ - estab_f = TRUE; - iconv = oconv; - } else { - /* still ambiguious */ - } - c2 = c1; - NEXT; - } else - /* 7 bit code , then send without any process */ - SEND; - } - /* send: */ - if((push_hold_buf(c2, c1) == EOF) || estab_f) - break; - c2 = 0; - continue; - } - - /** now, - ** 1) EOF is detected, or - ** 2) Code is established, or - ** 3) Buffer is FULL (but last word is pushed) - ** - ** in 1) and 3) cases, we continue to use - ** Kanji codes by oconv and leave estab_f unchanged. - **/ - - for (wc = 0; wc < hold_count; wc += 2) { - c2 = hold_buf[wc]; - c1 = hold_buf[wc+1]; - (*iconv)(c2, c1); - } - return; -} - - - -static int -push_hold_buf(c2, c1) - int c2, c1; -{ - if(hold_count >= HOLD_SIZE*2) - return (EOF); - hold_buf[hold_count++] = c2; - hold_buf[hold_count++] = c1; - return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); -} - - -static void -s_iconv(c2, c1) - int c2, - c1; -{ - if((c2 == EOF) || (c2 == 0)) { - /* NOP */ - } else { - c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394); - if(c1 < 0x9f) - c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f); - else { - c1 = c1 - 0x7e; - c2++; - } - } - (*oconv)(c2, c1); -} - - -static void -e_oconv(c2, c1) - int c2, c1; -{ - c2 = pre_convert(c1,c2); c1 = c1_return; - if(fold_f) { - switch(line_fold(c2,c1)) { - case '\n': - if(add_cr == TRUE) { - putchar('\r'); - c1 = '\n'; - } - putchar('\n'); - break; - case 0: return; - case '\r': - c1 = '\n'; c2 = 0; - break; - case '\t': - case ' ': - c1 = ' '; c2 = 0; - break; - } - } - if(c2==DOUBLE_SPACE) { - putchar(' '); putchar(' '); - return; - } - if(c2 == EOF) - return; - else if(c2 == 0 && (c1&0x80)) { - putchar(SSO); putchar(c1); - } else if(c2 == 0) { - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else { - if((c1<0x20 || 0x7e> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1))); - putchar((c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e))); - } - return; -} - - -static void -j_oconv(c2, c1) - int c2, c1; -{ - c2 = pre_convert(c1,c2); c1 = c1_return; - if(fold_f) { - switch(line_fold(c2,c1)) { - case '\n': - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - } - if(add_cr == TRUE) { - putchar('\r'); - c1 = '\n'; - } - putchar('\n'); - output_mode = ASCII; - break; - case '\r': - c1 = '\n'; c2 = 0; - break; - case '\t': - case ' ': - c1 = ' '; c2 = 0; - break; - case 0: return; - } - } - if(c2 == EOF) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - } - } else if(c2 == 0 && (c1 & 0x80)) { - if(input_mode==X0201 || !iso8859_f) { - if(output_mode!=X0201) { - putchar(ESC); - putchar('('); - putchar('I'); - output_mode = X0201; - } - c1 &= 0x7f; - } else { - /* iso8859 introduction, or 8th bit on */ - /* Can we convert in 7bit form using ESC-'-'-A ? - Is this popular? */ - } - putchar(c1); - } else if(c2 == 0) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - output_mode = ASCII; - } - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else if(c2 == DOUBLE_SPACE) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - output_mode = ASCII; - } - putchar(' '); - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else { - if(output_mode != X0208) { - putchar(ESC); - putchar('$'); - putchar(kanji_intro); - output_mode = X0208; - } - if(c1<0x20 || 0x7e0x80 Japanese (X0208/X0201) - <0x80 ASCII - \n new line - ' ' space - - This fold algorthm does not preserve heading space in a line. - This is the main difference from fmt. -*/ - -static int -line_fold(c2,c1) -int c2,c1; -{ - int prev0; - if(c1=='\r') - return 0; /* ignore cr */ - if(c1== 8) { - if(line>0) line--; - return 1; - } - if(c2==EOF && line != 0) /* close open last line */ - return '\n'; - /* new line */ - if(c1=='\n') { - if(prev == c1) { /* duplicate newline */ - if(line) { - line = 0; - return '\n'; /* output two newline */ - } else { - line = 0; - return 1; - } - } else { - if(prev&0x80) { /* Japanese? */ - prev = c1; - return 0; /* ignore given single newline */ - } else if(prev==' ') { - return 0; - } else { - prev = c1; - if(++line<=fold_len) - return ' '; - else { - line = 0; - return '\r'; /* fold and output nothing */ - } - } - } - } - if(c1=='\f') { - prev = '\n'; - if(line==0) - return 1; - line = 0; - return '\n'; /* output newline and clear */ - } - /* X0208 kankaku or ascii space */ - if( (c2==0&&c1==' ')|| - (c2==0&&c1=='\t')|| - (c2==DOUBLE_SPACE)|| - (c2=='!'&& c1=='!')) { - if(prev == ' ') { - return 0; /* remove duplicate spaces */ - } - prev = ' '; - if(++line<=fold_len) - return ' '; /* output ASCII space only */ - else { - prev = ' '; line = 0; - return '\r'; /* fold and output nothing */ - } - } - prev0 = prev; /* we still need this one... , but almost done */ - prev = c1; - if(c2 || (SSP<=c1 && c1<=0xdf)) - prev |= 0x80; /* this is Japanese */ - line += (c2==0)?1:2; - if(line<=fold_len) { /* normal case */ - return 1; - } - if(line>=fold_len+FOLD_MARGIN) { /* too many kinsou suspension */ - line = (c2==0)?1:2; - return '\n'; /* We can't wait, do fold now */ - } - /* simple kinsoku rules return 1 means no folding */ - if(c2==0) { - if(c1==0xde) return 1; /* $B!+(B*/ - if(c1==0xdf) return 1; /* $B!,(B*/ - if(c1==0xa4) return 1; /* $B!#(B*/ - if(c1==0xa3) return 1; /* $B!$(B*/ - if(c1==0xa1) return 1; /* $B!W(B*/ - if(c1==0xb0) return 1; /* - */ - if(SSP<=c1 && c1<=0xdf) { /* X0201 */ - line = 1; - return '\n';/* add one new line before this character */ - } - /* fold point in ASCII { [ ( */ - if(( c1!=')'&& - c1!=']'&& - c1!='}'&& - c1!='.'&& - c1!=','&& - c1!='!'&& - c1!='?'&& - c1!='/'&& - c1!=':'&& - c1!=';')&& - ((prev0=='\n')|| (prev0==' ')|| /* ignored new line */ - (prev0&0x80)) /* X0208 - ASCII */ - ) { - line = 1; - return '\n';/* add one new line before this character */ - } - return 1; /* default no fold in ASCII */ - } else { - if(c2=='!') { - if(c1=='"') return 1; /* $B!"(B */ - if(c1=='#') return 1; /* $B!#(B */ - if(c1=='$') return 1; /* $B!$(B */ - if(c1=='%') return 1; /* $B!%(B */ - if(c1=='\'') return 1; /* $B!\(B */ - if(c1=='(') return 1; /* $B!((B */ - if(c1==')') return 1; /* $B!)(B */ - if(c1=='*') return 1; /* $B!*(B */ - if(c1=='+') return 1; /* $B!+(B */ - if(c1==',') return 1; /* $B!,(B */ - } - line = 2; - return '\n'; /* add one new line before this character */ - } -} - -static int -pre_convert(c1,c2) - int c1,c2; -{ - if(c2) c1 &= 0x7f; - c1_return = c1; - if(c2==EOF) return c2; - c2 &= 0x7f; - if(rot_f) { - if(c2) { - c1 = rot47(c1); - c2 = rot47(c2); - } else { - if(!(c1 & 0x80)) - c1 = rot13(c1); - } - c1_return = c1; - } - /* JISX0208 Alphabet */ - if(alpha_f && c2 == 0x23 ) return 0; - /* JISX0208 Kigou */ - if(alpha_f && c2 == 0x21 ) { - if(0x21==c1) { - if(alpha_f&0x2) { - c1_return = ' '; - return 0; - } else if(alpha_f&0x4) { - c1_return = ' '; - return DOUBLE_SPACE; - } else { - return c2; - } - } else if(0x20' ';i++) { /* start at =? */ - if( ((((r[i] = c1 = getc(f))==EOF) || nkf_toupper(c1) != p[i] ) { - /* pattern fails, try next one */ - q = p; - while (p = mime_pattern[++j]) { - for(k=2;k i */ - if(p[k]!=q[k]) break; - if(k==i && nkf_toupper(c1)==p[k]) break; - } - if(p) continue; /* found next one, continue */ - /* all fails, output from recovery buffer */ - ungetc(c1,f); - for(j=0;j> 4) & 0x03); - if(c2 != '=') { - Fifo(mime_last++) = cc; - cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); - if(c3 != '=') { - Fifo(mime_last++) = cc; - cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); - if(c4 != '=') - Fifo(mime_last++) = cc; - } - } else { - return c1; - } - return Fifo(mime_top++); -} - -static int -mime_ungetc(c) - unsigned int c; -{ - Fifo(mime_last++) = c; - return c; -} - -#ifdef STRICT_MIME -int -mime_integrity(f,p) - FILE *f; - unsigned char *p; -{ - int c,d; - unsigned int q; - /* In buffered mode, read until =? or NL or buffer fffull - */ - mime_input = mime_top; - mime_last = mime_top; - while(*p) Fifo(mime_input++) = *p++; - d = 0; - q = mime_input; - while((c=getc(f))!=EOF) { - if(((mime_input-mime_top)&MIME_BUF_MASK)==0) break; - if(c=='=' && d=='?') { - /* checked. skip header, start decode */ - Fifo(mime_input++) = c; - mime_input = q; - return 1; - } - if(!( (c=='+'||c=='/'|| c=='=' || c=='?' || - ('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')))) - break; - /* Should we check length mod 4? */ - Fifo(mime_input++) = c; - d=c; - } - /* In case of Incomplete MIME, no MIME decode */ - Fifo(mime_input++) = c; - mime_last = mime_input; /* point undecoded buffer */ - mime_mode = 1; /* no decode on Fifo last in mime_getc */ - return 1; -} -#endif - -static int -base64decode(c) - int c; -{ - int i; - if(c > '@') - if(c < '[') - i = c - 'A'; /* A..Z 0-25 */ - else - i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ - else if(c > '/') - i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ - else if(c == '+') - i = '>' /* 62 */ ; /* + 62 */ - else - i = '?' /* 63 */ ; /* / 63 */ - return (i); -} - -static void -reinit() -{ - unbuf_f = FALSE; - estab_f = FALSE; - nop_f = FALSE; - binmode_f = TRUE; - rot_f = FALSE; - input_f = FALSE; - alpha_f = FALSE; - mime_f = TRUE; - mimebuf_f = FALSE; - broken_f = FALSE; - iso8859_f = FALSE; - x0201_f = TRUE; - x0201_f = NO_X0201; - fold_f = FALSE; - kanji_intro = DEFAULT_J; - ascii_intro = DEFAULT_R; - oconv = DEFAULT_CONV; - output_mode = ASCII; - input_mode = ASCII; - shift_mode = FALSE; - mime_mode = FALSE; - file_out = FALSE; - add_cr = FALSE; - del_cr = FALSE; - line = 0; -} - -#ifndef PERL_XS -int -usage() -{ - fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"); - fprintf(stderr,"Flags:\n"); - fprintf(stderr,"b,u Output is bufferred (DEFAULT),Output is unbufferred\n"); -#ifdef DEFAULT_CODE_SJIS - fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC)\n"); -#endif -#ifdef DEFAULT_CODE_JIS - fprintf(stderr,"j,s,e Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC)\n"); -#endif -#ifdef DEFAULT_CODE_EUC - fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT)\n"); -#endif - fprintf(stderr,"J,S,E Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC)\n"); - fprintf(stderr,"t no conversion\n"); - fprintf(stderr,"i_ Output sequence to designate JIS-kanji (DEFAULT B)\n"); - fprintf(stderr,"o_ Output sequence to designate ASCII (DEFAULT B)\n"); - fprintf(stderr,"r {de/en}crypt ROT13/47\n"); - fprintf(stderr,"v Show this usage\n"); - fprintf(stderr,"m[BQ0] MIME decode [B:base64,Q:quoted,0:no decode]\n"); - fprintf(stderr,"l ISO8859-1 (Latin-1) support\n"); - fprintf(stderr,"f Folding: -f60 or -f\n"); - fprintf(stderr,"Z[0-2] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces\n"); - fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"); - fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"); -#ifdef MSDOS - fprintf(stderr,"T Text mode output\n"); -#endif - fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n"); - fprintf(stderr,"d,c Delete \\r in line feed, Add \\r in line feed\n"); - fprintf(stderr,"Network Kanji Filter Version %s (%s) " -#if defined(MSDOS) && !defined(_Windows) - "for DOS" -#endif -#if !defined(__WIN32__) && defined(_Windows) - "for Win16" -#endif -#if defined(__WIN32__) && defined(_Windows) - "for Win32" -#endif -#ifdef __OS2__ - "for OS/2" -#endif - ,Version,Patchlevel); - fprintf(stderr,"\n%s\n",CopyRight); - return 0; -} -#endif - -/** - ** $B%Q%C%A@):n - ** ohta@src.ricoh.co.jp (Junn Ohta) - ** inouet@strl.nhk.or.jp (Tomoyuki Inoue) - ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama) - ** Kimihiko Sato - ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe) - ** kono@ie.u-ryukyu.ac.jp (Shinji Kono) - ** GHG00637@nifty-serve.or.jp (COW) - ** - ** $B:G=*99?7F|(B - ** 1998.11.7 - **/ - -/* end */ diff --git a/ext/nkf/test.rb b/ext/nkf/test.rb index 4519f8ba7e..7a2390d649 100644 --- a/ext/nkf/test.rb +++ b/ext/nkf/test.rb @@ -1,3 +1,19 @@ +#!/usr/local/bin/ruby +# +# nkf test program for nkf 1.7 +# Shinji KONO +# Sun Aug 18 12:25:40 JST 1996 +# Sun Nov 8 00:16:06 JST 1998 +# +# This is useful when you add new patch on nkf. +# Since this test is too strict, faileurs may not mean +# wrong conversion. +# +# nkf 1.5 differs on MIME decoding +# nkf 1.4 passes Basic Conversion tests +# nkf PDS version passes Basic Conversion tests using "nkf -iB -oB " +# + $counter = 0 def result(result, message = nil) $counter += 1 @@ -49,41 +65,150 @@ end $detail = false -def test(opt, input, expect) +def test(opt, input, expects) print "\nINPUT:\n", input if $detail - print "\nEXPECT:\n", expect if $detail + print "\nEXPECT:\n", expects.to_s if $detail result = nkf(opt, input) print "\nGOT:\n", result if $detail - print result == expect ? "Ok\n" : "Fail\n" - return result + expects.each do |e| + if result == e then + puts "Ok" + return result + end + end + puts "Fail" end + +example = Hash.new + # Basic Conversion -print "\nBasic Conversion test\n\n" +print "\nBasic Conversion test\n\n"; + +# I gave up simple literal quote because there are big difference +# on perl4 and perl5 on literal quote. Of course we cannot use +# jperl. -example = {} example['jis'] = <<'eofeof'.unpack('u')[0] M1FERED"6GIAR(%-E8V]N9"!3=&%G92"8I9=Y($AI M#28./ >@Y*#DR!+:6=O=2"!18&'@D^"8(._@]:$081@A+X* eofeof -#' example['euc'] = <<'eofeof'.unpack('u')[0] M1FERI?*E\R!+:6=O=2"AIJ'GH["CP:;!IMBGHJ?!J,`* eofeof -#' + +example['utf'] = <<'eofeof'.unpack('u')[0] +M1FERL)C^.7S)AJ"0D* +eofeof + +example['euc1'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['utf1'] = <<'eofeof'.unpack('u')[0] +AZ+J%Z:N/Z8JM"N>VNNFZEPKIM(D)Y+B*Z:"8Y+J8"0D* +eofeof + +example['jis2'] = <<'eofeof'.unpack('u')[0] ++&R1".EA&(QLH0@H` +eofeof + +example['sjis2'] = <<'eofeof'.unpack('u')[0] +%C=:3H0H` +eofeof + +example['euc2'] = <<'eofeof'.unpack('u')[0] +%NMC&HPH` +eofeof + +example['utf2'] = <<'eofeof'.unpack('u')[0] +'YI:.Z)>D"@`` +eofeof + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis'],[example['jis']]); +print "JIS to SJIS... ";test('-s',example['jis'],[example['sjis']]); +print "JIS to EUC ... ";test('-e',example['jis'],[example['euc']]); +print "JIS to UTF8... ";test('-w',example['jis'],[example['utf']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis'],[example['jis']]); +print "SJIS to SJIS... ";test('-s',example['sjis'],[example['sjis']]); +print "SJIS to EUC ... ";test('-e',example['sjis'],[example['euc']]); +print "SJIS to UTF8... ";test('-w',example['sjis'],[example['utf']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc'],[example['jis']]); +print "EUC to SJIS... ";test('-s',example['euc'],[example['sjis']]); +print "EUC to EUC ... ";test('-e',example['euc'],[example['euc']]); +print "EUC to UTF8... ";test('-w',example['euc'],[example['utf']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf'],[example['jis']]); +print "UTF8 to SJIS... ";test('-s',example['utf'],[example['sjis']]); +print "UTF8 to EUC ... ";test('-e',example['utf'],[example['euc']]); +print "UTF8 to UTF8... ";test('-w',example['utf'],[example['utf']]); + + + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis1'],[example['jis1']]); +print "JIS to SJIS... ";test('-s',example['jis1'],[example['sjis1']]); +print "JIS to EUC ... ";test('-e',example['jis1'],[example['euc1']]); +print "JIS to UTF8... ";test('-w',example['jis1'],[example['utf1']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis1'],[example['jis1']]); +print "SJIS to SJIS... ";test('-s',example['sjis1'],[example['sjis1']]); +print "SJIS to EUC ... ";test('-e',example['sjis1'],[example['euc1']]); +print "SJIS to UTF8... ";test('-w',example['sjis1'],[example['utf1']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc1'],[example['jis1']]); +print "EUC to SJIS... ";test('-s',example['euc1'],[example['sjis1']]); +print "EUC to EUC ... ";test('-e',example['euc1'],[example['euc1']]); +print "EUC to UTF8... ";test('-w',example['euc1'],[example['utf1']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf1'],[example['jis1']]); +print "UTF8 to SJIS... ";test('-s',example['utf1'],[example['sjis1']]); +print "UTF8 to EUC ... ";test('-e',example['utf1'],[example['euc1']]); +print "UTF8 to UTF8... ";test('-w',example['utf1'],[example['utf1']]); + +# Ambigous Case example['amb'] = <<'eofeof'.unpack('u')[0] MI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&E @@ -117,6 +242,31 @@ M)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q >)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(* eofeof +print "Ambiguous Case. "; + test('-j',example['amb'],[example['amb.euc']]); + +# Input assumption + +print "SJIS Input assumption "; + test('-jSx',example['amb'],[example['amb.sjis']]); + +# Broken JIS + +print "Broken JIS "; + $input = example['jis']; + $input.gsub("\033",''); + test('-Be',$input,[example['euc']]); +print "Broken JIS is safe on Normal JIS? "; + $input = example['jis']; + test('-Be',$input,[example['euc']]); + +# X0201 仮名 +# X0201->X0208 conversion +# X0208 aphabet -> ASCII +# X0201 相互変換 + +print "\nX0201 test\n\n"; + example['x0201.sjis'] = <<'eofeof'.unpack('u')[0] MD5.*<(-*@TR#3H-0@U*#2X--@T^#48-3"I%3B7""8()A@F*"8X)D@F6"9H*! M@H*"@X*$@H6"AH*'"I%3BTR-AH%)@9>!E(&0@9.!3X&5@9:!:8%J@7R!>X&! @@ -124,7 +274,6 @@ M@6V!;H%O@7"!CPJ4O(IPMK>X/;FZMMZWWKC>N=ZZWH+&"I2\BG#*W\O?S-_- MW\[?M]^QW@K*W\O?S`IH86YK86MU(,K?R]_,I`K*W\O?S-VA"I2\BG""S(SC !"@!" eofeof -#' example['x0201.euc'] = <<'eofeof'.unpack('u')[0] MP;2ST:6KI:VEKZ6QI;.EK*6NI;"ELJ6T"L&TL=&CP:/"H\.CQ*/%H\:CQZ/A @@ -134,7 +283,17 @@ MWJ3("LB^L]&.RH[?CLN.WX[,CM^.S8[?CLZ.WXZWCM^.L8[>"H[*CM^.RX[? MCLP*:&%N:V%K=2".RH[?CLN.WX[,CJ0*CLJ.WX[+CM^.S([=CJ$*R+ZST:3. #N.4* eofeof -#' + +example['x0201.utf'] = <<'eofeof'.unpack('u')[0] +MY86HZ*>2XX*KXX*MXX*OXX*QXX*SXX*LXX*NXX*PXX*RXX*T"N6%J.B+L>^\ +MH>^\HN^\H^^\I.^\I>^\IN^\I^^]@>^]@N^]@^^]A.^]A>^]AN^]APKEA:CH +MJ)CEC[?OO('OO*#OO(/OO(3OO(7OO+[OO(;OO(KOO(COO(GBB)+OO(OOO)WO +MO+OOO+WOO9OOO9WOOZ4*Y8V*Z*>2[[VV[[VW[[VX/>^]N>^]NN^]MN^^GN^] +MM^^^GN^]N.^^GN^]N>^^GN^]NN^^GN.!J`KEC8KHIY+OOHKOOI_OOHOOOI_O +MOHSOOI_OOHWOOI_OOH[OOI_OO;?OOI_OO;'OOIX*[[Z*[[Z?[[Z+[[Z?[[Z, +M"FAA;FMA:W4@[[Z*[[Z?[[Z+[[Z?[[Z,[[VD"N^^BN^^G^^^B^^^G^^^C.^^ +2G>^]H0KEC8KHIY+C@:[EOHP* +eofeof example['x0201.jis'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$C02-"(T,C @@ -144,7 +303,6 @@ M/1LH23DZ-EXW7CA>.5XZ7ALD0B1(&RA""ALD0D@^,U$;*$E*7TM?3%]-7TY? M-U\Q7ALH0@H;*$E*7TM?3!LH0@IH86YK86MU(!LH24I?2U],)!LH0@H;*$E* 97TM?3%TA&RA""ALD0D@^,U$D3CAE&RA""@`` eofeof -#` example['x0201.sosi'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA*"ALD0D$T,5$C02-"(T,C @@ -154,7 +312,6 @@ M*$H]#CDZ-EXW7CA>.5XZ7@\;)$(D2!LH2@H;)$)(/C-1&RA*#DI?2U],7TU? M3E\W7S%>#PH.2E]+7TP/&RA*"FAA;FMA:W4@#DI?2U],)`\;*$H*#DI?2U], 672$/&RA*"ALD0D@^,U$D3CAE&RA""@`` eofeof -#" example['x0201.x0208'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$;*$)!0D-$ @@ -164,7 +321,34 @@ M)$)(/C-1)5$E5"57)5HE724M(2PE(B$K&RA""ALD0B51)50E51LH0@IH86YK M86MU(!LD0B51)50E52$B&RA""ALD0B51)50E525S(2,;*$(*&R1"2#XS421. &.&4;*$(* eofeof -#` + +# -X is necessary to allow X0201 in SJIS +# -Z convert X0208 alphabet to ASCII +print "X0201 conversion: SJIS "; + test('-jXZ',example['x0201.sjis'],[example['x0201.x0208']]); +print "X0201 conversion: JIS "; + test('-jZ',example['x0201.jis'],[example['x0201.x0208']]); +print "X0201 conversion:SI/SO "; + test('-jZ',example['x0201.sosi'],[example['x0201.x0208']]); +print "X0201 conversion: EUC "; + test('-jZ',example['x0201.euc'],[example['x0201.x0208']]); +print "X0201 conversion: UTF8 "; + test('-jZ',example['x0201.utf'],[example['x0201.x0208']]); +# -x means X0201 output +print "X0201 output: SJIS "; + test('-xs',example['x0201.euc'],[example['x0201.sjis']]); +print "X0201 output: JIS "; + test('-xj',example['x0201.sjis'],[example['x0201.jis']]); +print "X0201 output: EUC "; + test('-xe',example['x0201.jis'],[example['x0201.euc']]); +print "X0201 output: UTF8 "; + test('-xw',example['x0201.jis'],[example['x0201.utf']]); + +# MIME decode + +print "\nMIME test\n\n"; + +# MIME ISO-2022-JP example['mime.iso2022'] = <<'eofeof'.unpack('u')[0] M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23TI566Q/4U9)1WEH2S\]"CT_ @@ -178,7 +362,6 @@ M96E23U!Y:S=D"FAS;U-G/3T_/2`]/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W M96E23U!Y:S=D:'-O4V<]/3\]"CT_25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I 44D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.ans.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -188,7 +371,6 @@ M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D"FAS;U-G/3T_/2`] M/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D:'-O4V<]/3\]"CT_ L25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I4D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.unbuf.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -206,7 +388,6 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.unbuf'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -215,21 +396,48 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.base64'] = <<'eofeof'.unpack('u')[0] M9W-M5"])3&YG$I+-&=Q=4,S24LS9W%Q0E%:3TUI-39,,S0Q-&=S5T)1 M43!+9VUA1%9O3T@*9S)+1%1O3'=K8C)1;$E+;V=Q2T-X24MG9W5M0W%*3EEG <$E+9V=U;4,X64Q&9W)70S592VMG<6U""F=Q eofeof -#" example['mime.base64.ans'] = <<'eofeof'.unpack('u')[0] M&R1")$M&?B1I)#LD1D0Z)"TD7B0Y)"PA(D5L-7XV83E9)$B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"tmp1");print OUT pack('u',$tmp);close(OUT); +# unbuf mode implies more pessimistic decode +print "MIME decode (unbuf) "; + $tmp = test('-jmNu',example['mime.iso2022'],[example['mime.unbuf'],example['mime.unbuf.alt']]); + # open(OUT,">tmp2");print OUT pack('u',$tmp);close(OUT); +print "MIME decode (base64) "; + test('-jTmB',example['mime.base64'],[example['mime.base64.ans']]); + +# MIME ISO-8859-1 example['mime.is8859'] = <<'eofeof'.unpack('u')[0] M/3])4T\M.#@U.2TQ/U$_*CU#-V%V83\_/2`*4&5E2P@1$5.34%22R`@7"`B36EN(&OF<&AEX0208 conversion -# X0208 aphabet -> ASCII -# X0201 相互変換 +example['test_data/cr.ans'] = <<'eofeof'.unpack('u')[0] +7&R1")$8D.21(&RA""G1EI**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DSR!# +M4B],1B"DSKG4H:,-"J2SI+.DSR!#4B"DSKG4H:,-I+.DLZ3/($Q&+T-2(*3. +9N=2AHPH-"J2SI+.DSR!,1B"DSKG4H:,*"@`` +eofeof -print "Next test is expeced to Fail.\n" +example['test_data/long-fold-1.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#tmp1');print OUT pack('u',$tmp);close(OUT); -# unbuf mode implies more pessimistic decode -print 'MIME decode (unbuf) ' -test('-mu', example['mime.iso2022'], example['mime.unbuf']) -print 'MIME decode (base64) ' -t = test('-mB', example['mime.base64'], example['mime.base64.ans']) +print "test_data/long-fold-1 "; + test('-jTF60',example['test_data/long-fold-1'],[example['test_data/long-fold-1.ans']]); +# test_data/long-fold -# MIME ISO-8859-1 +example['test_data/long-fold'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AHJ2SI.RD +M\J2]I,ZDWJ3>I**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DS\.[ +'I*2YU*&C"@`` +eofeof -# Without -l, ISO-8859-1 was handled as X0201. +example['test_data/long-fold.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#5)#2D-):TI#46U*0V=K2VE1 +M5%X2D1-:TY343-*1&MK3WAS;U%G/3T_/2`*"3T_25-/+3(P,C(M +M2E`_0C]'>5)#2D0P:U!Y4D)*15%K4FE224I%;VM3>5)-2D4P:U1I4E!*1DEK +M5E-264=Y:$,_/2`*"3T_25-/+3(P,C(M2E`_0C]'>5)#2D9S:UAI4F9*1T%K +M65-2:4I'46M*0U)M2D-G:V%"A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +example['test_data/multi-line.ans'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AH@"DLZ3L +MI/*DO:3.I-ZDWJ2BI,2DJZ2FI,BAHJ3)I*:DXJ3?I-"DJ*2LI.^DZZ2DI+>A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +print "test_data/multi-line "; + test('-e',example['test_data/multi-line'],[example['test_data/multi-line.ans']]); +# test_data/nkf-19-bug-1 + +example['test_data/nkf-19-bug-1'] = <<'eofeof'.unpack('u')[0] +,I*:DJZ2D"KK8QJ,* +eofeof + +example['test_data/nkf-19-bug-1.ans'] = <<'eofeof'.unpack('u')[0] +8&R1")"8D*R0D&RA""ALD0CI81B,;*$(* +eofeof + +print "test_data/nkf-19-bug-1 "; + test('-Ej',example['test_data/nkf-19-bug-1'],[example['test_data/nkf-19-bug-1.ans']]); +# test_data/nkf-19-bug-2 + +example['test_data/nkf-19-bug-2'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +example['test_data/nkf-19-bug-2.ans'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +print "test_data/nkf-19-bug-2 "; + test('-Ee',example['test_data/nkf-19-bug-2'],[example['test_data/nkf-19-bug-2.ans']]); +# test_data/nkf-19-bug-3 + +example['test_data/nkf-19-bug-3'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['test_data/nkf-19-bug-3.ans'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +print "test_data/nkf-19-bug-3 "; + test('-e',example['test_data/nkf-19-bug-3'],[example['test_data/nkf-19-bug-3.ans']]); +# test_data/non-strict-mime + +example['test_data/non-strict-mime'] = <<'eofeof'.unpack('u')[0] +M/3])4T\M,C`R,BU*4#]"/PIG$E+:6=R,D-V;TMI +,9W-30V]O3&,*/ST* +eofeof + +example['test_data/non-strict-mime.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$8D)"0_)$`D)"1&)%XD.2$C&RA"#0H-"ALD0CMD)$\[?B$Y)6PE.21+ +<)&(]<20K)#LD1B0D)#\D0"0D)$8D)"1>&RA""@`` +eofeof + +print "test_data/non-strict-mime "; + test('-jTmN',example['test_data/non-strict-mime'],[example['test_data/non-strict-mime.ans']]); +# test_data/q-encode-softrap + +example['test_data/q-encode-softrap'] = <<'eofeof'.unpack('u')[0] +H/3%")$(T03MZ)3T*,R$\)4DD3CTQ0BA""CTQ0B1"2E$T.3TQ0BA""@`` +eofeof + +example['test_data/q-encode-softrap.ans'] = <<'eofeof'.unpack('u')[0] +>&R1"-$$[>B4S(3PE221.&RA""ALD0DI1-#D;*$(* +eofeof + +print "test_data/q-encode-softrap "; + test('-jTmQ',example['test_data/q-encode-softrap'],[example['test_data/q-encode-softrap.ans']]); +# test_data/rot13 + +example['test_data/rot13'] = <<'eofeof'.unpack('u')[0] +MI+.D\Z3+I,&DSZ&BS:W"]*3(I*2DI*3>I+FAHPH*;FMF('9EI+FDK*&B05-#24D@I,O"T*2WI,8@ +M4D]4,3,@I*P*P+6DMZ2OQK"DI*3&I*2DRJ2DI.BDIJ3'H:*PRK*\I,ZDZ*2F +MI,O*T;2YI+6D[*3>I+ND\Z&C"@HE(&5C:&\@)VAO9V4G('P@;FMF("UR"FAO +#9V4* +eofeof + +example['test_data/rot13.ans'] = <<'eofeof'.unpack('u')[0] +M&R1"4V)31%-Z4W!3?E!1?%QQ15-W4U-34U,O4VA04ALH0@H*87AS(&ER92XQ +M+CDR(!LD0E-#?$E\(E-D4VI3=5-34VY3;U-34W534U,O4VA36U!1&RA"3D90 +M5E8@&R1"4WIQ(5-F4W4;*$(@14)',3,@&R1"4UL;*$(*&R1";V139E->=5]3 +M4U-U4U-3>5-34SE355-V4%%?>6%K4WU3.5-54WIY(F-H4V13/5,O4VI31%!2 +A&RA""@HE(')P=6(@)W5B='(G('P@87AS("UE"G5B='(* +eofeof + +print "test_data/rot13 "; + test('-jr',example['test_data/rot13'],[example['test_data/rot13.ans']]); +# test_data/slash + +example['test_data/slash'] = <<'eofeof'.unpack('u')[0] +7("`]/U8\5"U5.5=%2RTK.U