diff options
author | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-03 07:30:42 +0000 |
---|---|---|
committer | naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> | 2004-11-03 07:30:42 +0000 |
commit | dbe30b383b6d63ea3395b79719d53616beb86e46 (patch) | |
tree | 2e1ce48ba33b3bbdd9a5b1f5e2c1b91c63f34c25 | |
parent | e3475cde2a53e9ce560818146dd09f7d4d3d6fbf (diff) |
* follow nkf 2.0.4
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/branches/ruby_1_8@7186 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- | ext/nkf/depend | 2 | ||||
-rw-r--r-- | ext/nkf/lib/kconv.rb | 197 | ||||
-rw-r--r-- | ext/nkf/nkf.c | 191 | ||||
-rw-r--r-- | ext/nkf/nkf1.7/nkf.c | 1900 | ||||
-rw-r--r-- | ext/nkf/test.rb | 538 |
5 files changed, 780 insertions, 2048 deletions
diff --git a/ext/nkf/depend b/ext/nkf/depend index 13e32e6074..0ed8fea8d2 100644 --- a/ext/nkf/depend +++ b/ext/nkf/depend @@ -1 +1 @@ -nkf.o : nkf.c $(hdrdir)/ruby.h $(topdir)/config.h $(hdrdir)/defines.h $(srcdir)/nkf1.7/nkf.c +nkf.o : nkf.c $(hdrdir)/ruby.h $(topdir)/config.h $(hdrdir)/defines.h $(srcdir)/nkf-utf8/nkf.c $(srcdir)/nkf-utf8/utf8tbl.c $(srcdir)/nkf-utf8/config.h diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb index af6d82275f..1fd28a5a59 100644 --- a/ext/nkf/lib/kconv.rb +++ b/ext/nkf/lib/kconv.rb @@ -1,73 +1,226 @@ require 'nkf' module Kconv - AUTO = NKF::AUTO - JIS = NKF::JIS - EUC = NKF::EUC - SJIS = NKF::SJIS - BINARY = NKF::BINARY - NOCONV = NKF::NOCONV - UNKNOWN = NKF::UNKNOWN + #Constant of Encoding + AUTO = ::NKF::AUTO + JIS = ::NKF::JIS + EUC = ::NKF::EUC + SJIS = ::NKF::SJIS + BINARY = ::NKF::BINARY + NOCONV = ::NKF::NOCONV + ASCII = ::NKF::ASCII + UTF8 = ::NKF::UTF8 + UTF16 = ::NKF::UTF16 + UTF32 = ::NKF::UTF32 + UNKNOWN = ::NKF::UNKNOWN + + #Regexp of Encoding + Iconv_Shift_JIS = /\A(?: + [\x00-\x7f\xa1-\xdf] | + \x81[\x40-\x7e\x80-\xac\xb8-\xbf\xc8-\xce\xda-\xe8\xf0-\xf7\xfc] | + \x82[\x4f-\x58\x60-\x79\x81-\x9a\x9f-\xf1] | + \x83[\x40-\x7e\x80-\x96\x9f-\xb6\xbf-\xd6\x40-\x60] | + \x84[\x40-\x60\x70-\x7e\x80-\x91\x9f-\xbe\x9f-\xfc] | + [\x89-\x8f\x90-\x97\x99-\x9f\xe0-\xea][\x40-\x7e] | + [\x89-\x97\x99-\x9f\xe0-\xe9][\x80-\xfc] | + \x98[\x40-\x72\x9f-\xfc] | + \xea[\x80-\xa4] + )*\z/nx + Iconv_EUC_JP = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1\xb0-\xbce\xd0-\xf3][\xa1-\xfe] | + \xa2[\xa1-\xae\xba-\xc1\xca-\xd0\xdc-\xea\xf2-\xf9\xfe] | + \xa3[\xb0-\xb9\xc1-\xda\xe1-\xfa] | + \xa4[\xa1-\xf3] | + \xa5[\xa1-\xf6] | + \xa6[\xa1-\xb8\xc1-\xd8] | + \xa7[\xa1-\xc1\xd1-\xf1] | + \xa8[\xa1-\xc0] | + \xcf[\xa1-\xd3] | + \xf4[\xa1-\xa6] + )*\z/nx + Iconv_UTF8 = /\A(?:\xef\xbb\xbf)?(?: + [\x00-\x7f] | + \xc2[\x80-\x8d\x90-\x9f\xa1\xaa\xac\xae-\xb1\xb4\xb6\xb8\xba\xbf] | + \xc3[\x80-\xbf] | + \xc4[\x80-\x93\x96-\xa2\xa4-\xab\xae-\xbf] | + \xc5[\x80-\x8d\x90-\xbe] | + \xc7[\x8d-\x9c\xb5] | + \xcb[\x87\x98-\x9b\x9d] | + \xce[\x84-\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf] | + \xcf[\x80-\x8e] | + \xd0[\x81-\x8c\x8e-\xbf] | + \xd1[\x80-\x8f\x91-\x9f] | + \xe2\x84[\x83\x96\xa2\xab] | + \xe2\x86[\x83\x91-\x93\x96\xa2\xab] | + \xe2\x87[\x83\x91-\x94\x96\xa2\xab] | + \xe2\x88[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0\xa2\xa7-\xac\xb4-\xb5\xbd] | + \xe2\x89[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa6-\xac\xb4-\xb5\xbd] | + \xe2[\x8a\x8c][\x82-\x83\x86-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa5-\xac\xb4-\xb5\xbd] | + \xe2[\x94-\x99][\x81-\x83\x86-\x88\x8b-\x8c\x8f-\x94\x96-\x98\x9a-\x9e\xa0-\xac\xaf-\xb0\xb3-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3\x80[\x81-\x83\x85-\x98\x9a-\x9e\xa0-\xad\xaf-\xb0\xb2-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3[\x81-\x83\xb8-\xbf][\x81-\xbf] | + [\xe5-\xe7][\x80-\xbf][\x81-\xbf] | + \xe8[\x80-\xae\xb0-\xbf][\x81-\xbf] | + \xe9[\x80-\x92\x95-\xb1\xb3-\xbe][\x81-\xbf] | + \xef[\xbc-\xbe][\x81-\xbf] | + )*\z/nx + RegexpShiftjis = /\A(?: + [\x00-\x7f\xa1-\xdf] | + [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc] + )*\z/nx + RegexpEucjp = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1-\xdf] [\xa1-\xdf] + )*\z/nx + RegexpUtf8 = /\A(?: + [\x00-\x7f] | + [\xc2-\xdf] [\x80-\xbf] | + \xe0 [\xa0-\xbf] [\x80-\xbf] | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] + )*\z/nx + + # + # kconv + # + def kconv(str, out_code, in_code = AUTO) opt = '-' case in_code - when NKF::JIS + when ::NKF::JIS opt << 'J' - when NKF::EUC + when ::NKF::EUC opt << 'E' - when NKF::SJIS + when ::NKF::SJIS opt << 'S' + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'W' end case out_code - when NKF::JIS + when ::NKF::JIS opt << 'j' - when NKF::EUC + when ::NKF::EUC opt << 'e' - when NKF::SJIS + when ::NKF::SJIS opt << 's' - when NKF::NOCONV + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'w' + when ::NKF::NOCONV return str end opt = '' if opt == '-' - NKF::nkf(opt, str) + ::NKF::nkf(opt, str) end module_function :kconv + # + # Encode to + # + def tojis(str) - NKF::nkf('-j', str) + ::NKF::nkf('-j', str) end module_function :tojis def toeuc(str) - NKF::nkf('-e', str) + ::NKF::nkf('-e', str) end module_function :toeuc def tosjis(str) - NKF::nkf('-s', str) + ::NKF::nkf('-s', str) end module_function :tosjis + def toutf8(str) + ::NKF::nkf('-w', str) + end + module_function :toutf8 + + def toutf16(str) + ::NKF::nkf('-w16', str) + end + module_function :toutf16 + + # + # guess + # + def guess(str) - NKF::guess(str) + ::NKF::guess(str) end module_function :guess + + def guess_old(str) + ::NKF::guess_old(str) + end + module_function :guess_old + + # + # isEncoding + # + + def iseuc(str) + RegexpEucjp.match( str ) + end + module_function :iseuc + + def issjis(str) + RegexpShiftjis.match( str ) + end + module_function :issjis + + def isutf8(str) + RegexpUtf8.match( str ) + end + module_function :isutf8 + end class String def kconv(out_code, in_code=Kconv::AUTO) Kconv::kconv(self, out_code, in_code) end + + # to Encoding def tojis - NKF::nkf('-j', self) + ::NKF::nkf('-j', self) end def toeuc - NKF::nkf('-e', self) + ::NKF::nkf('-e', self) end def tosjis - NKF::nkf('-s', self) + ::NKF::nkf('-s', self) + end + def toutf8 + ::NKF::nkf('-w', self) + end + def toutf16 + ::NKF::nkf('-w16', self) + end + + # is Encoding + def iseuc + Kconv.iseuc( self ) + end + + def issjis + Kconv.issjis( self ) + end + + def isutf8 + Kconv.isutf8( self ) end end diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c index ca6de73e10..6517b3aba1 100644 --- a/ext/nkf/nkf.c +++ b/ext/nkf/nkf.c @@ -1,51 +1,82 @@ +/* + * NKF Module for Ruby base on nkf 2.x + * + * original nkf2.0 is maintained at http://sourceforge.jp/projects/nkf/ + * + */ + +static char *RVersion = "2.0.4.1r1"; + #include "ruby.h" +/* Encoding Constants */ #define _AUTO 0 #define _JIS 1 #define _EUC 2 #define _SJIS 3 #define _BINARY 4 #define _NOCONV 4 +#define _ASCII 5 +/* 0b011x is reserved for UTF-8 Family */ +#define _UTF8 6 +/* 0b10xx is reserved for UTF-16 Family */ +#define _UTF16 8 +/* 0b11xx is reserved for UTF-32 Family */ +#define _UTF32 12 +#define _OTHER 16 #define _UNKNOWN _AUTO +/* Replace nkf's getchar/putchar for variable modification */ +/* we never use getc, ungetc */ + #undef getc #undef ungetc -#define getc(f) (input_ctr<i_len?input[input_ctr++]:-1) -#define ungetc(c,f) input_ctr-- +#define getc(f) (input_ctr>=i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- +#define INCSIZE 32 #undef putchar -#define putchar(c) rb_nkf_putchar(c) +#undef TRUE +#undef FALSE +#define putchar(c) rb_nkf_putchar(c) -#define INCSIZE 32 -static int incsize; +/* Input/Output pointers */ -static unsigned char *input, *output; -static int input_ctr, i_len; -static int output_ctr, o_len; +static unsigned char *output; +static unsigned char *input; +static int input_ctr; +static int i_len; +static int output_ctr; +static int o_len; +static int incsize; -static VALUE dst; +static VALUE result; static int rb_nkf_putchar(c) - unsigned int c; + unsigned int c; { if (output_ctr >= o_len) { o_len += incsize; - rb_str_resize(dst, o_len); - output = RSTRING(dst)->ptr; + rb_str_resize(result, o_len); incsize *= 2; + output = RSTRING(result)->ptr; } output[output_ctr++] = c; return c; } +/* Include kanji filter main part */ +/* getchar and putchar will be replaced during inclusion */ + #define PERL_XS 1 -#include "nkf1.7/nkf.c" +#include "nkf-utf8/utf8tbl.c" +#include "nkf-utf8/nkf.c" static VALUE rb_nkf_kconv(obj, opt, src) - VALUE obj, opt, src; + VALUE obj, opt, src; { char *opt_ptr, *opt_end; volatile VALUE v; @@ -58,44 +89,46 @@ rb_nkf_kconv(obj, opt, src) if (*opt_ptr != '-') { continue; } - arguments(opt_ptr); + options(opt_ptr); } incsize = INCSIZE; - input_ctr = 0; + input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; - dst = rb_str_new(0, i_len*3 + 10); - v = dst; + result = rb_str_new(0, i_len*3 + 10); + v = result; output_ctr = 0; - output = RSTRING(dst)->ptr; - o_len = RSTRING(dst)->len; + output = RSTRING(result)->ptr; + o_len = RSTRING(result)->len; *output = '\0'; - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - iso8859_f = FALSE; - } + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); kanji_convert(NULL); - RSTRING(dst)->ptr[output_ctr] = '\0'; - RSTRING(dst)->len = output_ctr; - OBJ_INFECT(dst, src); + RSTRING(result)->ptr[output_ctr] = '\0'; + RSTRING(result)->len = output_ctr; + OBJ_INFECT(result, src); - return dst; + return result; } + /* + * NKF.guess1 + * * Character code detection - Algorithm described in: * Ken Lunde. `Understanding Japanese Information Processing' * Sebastopol, CA: O'Reilly & Associates. */ static VALUE -rb_nkf_guess(obj, src) - VALUE obj, src; +rb_nkf_guess1(obj, src) + VALUE obj, src; { unsigned char *p; unsigned char *pend; @@ -107,16 +140,16 @@ rb_nkf_guess(obj, src) if (p == pend) return INT2FIX(_UNKNOWN); #define INCR do {\ - p++;\ - if (p==pend) return INT2FIX(_UNKNOWN);\ - sequence_counter++;\ - if (sequence_counter % 2 == 1 && *p != 0xa4)\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ sequence_counter = 0;\ - if (6 <= sequence_counter) {\ - sequence_counter = 0;\ - return INT2FIX(_EUC);\ - }\ -} while (0) + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ + } while (0) if (*p == 0xa4) sequence_counter = 1; @@ -180,19 +213,77 @@ rb_nkf_guess(obj, src) return INT2FIX(_UNKNOWN); } + +/* + * NKF.guess2 + * + * Guess Encoding By NKF2.0 Routine + */ + +static VALUE +rb_nkf_guess2(obj, src) + VALUE obj, src; +{ + int code = _BINARY; + + reinit(); + + input_ctr = 0; + StringValue(src); + input = RSTRING(src)->ptr; + i_len = RSTRING(src)->len; + + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); + + guess_f = TRUE; + kanji_convert( NULL ); + guess_f = FALSE; + + if (!is_inputcode_mixed) { + if (strcmp(input_codename, "") == 0) { + code = _ASCII; + } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { + code = _JIS; + } else if (strcmp(input_codename, "EUC-JP") == 0) { + code = _EUC; + } else if (strcmp(input_codename, "Shift_JIS") == 0) { + code = _SJIS; + } else if (strcmp(input_codename, "UTF-8") == 0) { + code = _UTF8; + } else if (strcmp(input_codename, "UTF-16") == 0) { + code = _UTF16; + } else if (strlen(input_codename) > 0) { + code = _UNKNOWN; + } + } + + return INT2FIX( code ); +} + + +/* Initialize NKF Module */ + void Init_nkf() { - VALUE mKconv = rb_define_module("NKF"); - - rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); - rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); - - rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); - rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); - rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); - rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); - rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); - rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); - rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1); + + rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII)); + rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8)); + rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16)); + rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + rb_define_const(mKconv, "VERSION", rb_str_new2(RVersion)); } diff --git a/ext/nkf/nkf1.7/nkf.c b/ext/nkf/nkf1.7/nkf.c deleted file mode 100644 index 09419f40a7..0000000000 --- a/ext/nkf/nkf1.7/nkf.c +++ /dev/null @@ -1,1900 +0,0 @@ -/** Network Kanji Filter. (PDS Version) -************************************************************************ -** Copyright (C) 1987, Fujitsu LTD. (Itaru ICHIKAWA) -** $BO"Mm@h!'(B $B!J3t!KIY;NDL8&5f=j!!%=%U%H#38&!!;T@n!!;j(B -** $B!J(BE-Mail Address: ichikawa@flab.fujitsu.co.jp$B!K(B -** Copyright (C) 1996,1998 -** $BO"Mm@h!'(B $BN05eBg3X>pJs9)3X2J(B $B2OLn(B $B??<#(B mine/X0208 support -** $B!J(BE-Mail Address: kono@ie.u-ryukyu.ac.jp$B!K(B -** $BO"Mm@h!'(B COW for DOS & Win16 & Win32 & OS/2 -** $B!J(BE-Mail Address: GHG00637@niftyserve.or.p$B!K(B -** $B$3$N%=!<%9$N$$$+$J$kJ#<L!$2~JQ!$=$@5$b5vBz$7$^$9!#$?$@$7!"(B -** $B$=$N:]$K$O!"C/$,9W8%$7$?$r<($9$3$NItJ,$r;D$9$3$H!#(B -** $B:FG[I[$d;(;o$NIUO?$J$I$NLd$$9g$o$;$bI,MW$"$j$^$;$s!#(B -** $B$3$N%W%m%0%i%`$K$D$$$F$OFC$K2?$NJ]>Z$b$7$J$$!"0-$7$+$i$:!#(B -** Everyone is permitted to do anything on this program -** including copying, modifying, improving. -** as long as you don't try to pretend that you wrote it. -** i.e., the above copyright notice has to appear in all copies. -** You don't have to ask before copying or publishing. -** THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE. -***********************************************************************/ - -static char *CopyRight = - "Copyright (C) 1987, FUJITSU LTD. (I.Ichikawa),1998 S. Kono, COW"; -static char *Version = - "1.7"; -static char *Patchlevel = - "0/9711/Shinji Kono"; - -/* -** -** -** -** USAGE: nkf [flags] [file] -** -** Flags: -** b Output is bufferred (DEFAULT) -** u Output is unbufferred -** -** t no operation -** -** j Outout code is JIS 7 bit (DEFAULT SELECT) -** s Output code is MS Kanji (DEFAULT SELECT) -** e Output code is AT&T JIS (DEFAULT SELECT) -** l Output code is JIS 7bit and ISO8859-1 Latin-1 -** -** m MIME conversion for ISO-2022-JP -** i_ Output sequence to designate JIS-kanji (DEFAULT_J) -** o_ Output sequence to designate single-byte roman characters (DEFAULT_R) -** -** r {de/en}crypt ROT13/47 -** -** v display Version -** -** T Text mode output (for MS-DOS) -** -** x Do not convert X0201 kana into X0208 -** Z Convert X0208 alphabet to ASCII -** -** f60 fold option -** -** m MIME decode -** B try to fix broken JIS, missing Escape -** B[1-9] broken level -** -** O Output to 'nkf.out' file -** d Delete \r in line feed -** c Add \r in line feed -**/ -/******************************/ -/* $B%G%U%)%k%H$N=PNO%3!<%IA*Br(B */ -/* Select DEFAULT_CODE */ -#define DEFAULT_CODE_JIS -/* #define DEFAULT_CODE_SJIS */ -/* #define DEFAULT_CODE_EUC */ -/******************************/ - -#if (defined(__TURBOC__) || defined(LSI_C)) && !defined(MSDOS) -#define MSDOS -#endif - -#ifndef PERL_XS -#include <stdio.h> -#endif - -#if defined(MSDOS) || defined(__OS2__) -#include <stdlib.h> -#include <fcntl.h> -#include <io.h> -#endif - -#ifdef MSDOS -#ifdef LSI_C -#define setbinmode(fp) fsetbin(fp) -#else /* Microsoft C, Turbo C */ -#define setbinmode(fp) setmode(fileno(fp), O_BINARY) -#endif -#else /* UNIX,OS/2 */ -#define setbinmode(fp) -#endif - -#ifdef _IOFBF /* SysV and MSDOS */ -#define setvbuffer(fp, buf, size) setvbuf(fp, buf, _IOFBF, size) -#else /* BSD */ -#define setvbuffer(fp, buf, size) setbuffer(fp, buf, size) -#endif - -/*Borland C++ 4.5 EasyWin*/ -#if defined(__TURBOC__) && defined(_Windows) && !defined(__WIN32__) /*Easy Win */ -#define EASYWIN -#include <windows.h> -#endif - -#define FALSE 0 -#define TRUE 1 - -/* state of output_mode and input_mode */ - -#define ASCII 0 -#define X0208 1 -#define X0201 2 -#define NO_X0201 3 -#define JIS_INPUT 4 -#define SJIS_INPUT 5 -#define LATIN1_INPUT 6 -#define FIXED_MIME 7 -#define DOUBLE_SPACE -2 - -#define NL 0x0a -#define ESC 0x1b -#define SPACE 0x20 -#define AT 0x40 -#define SSP 0xa0 -#define DEL 0x7f -#define SI 0x0f -#define SO 0x0e -#define SSO 0x8e - -#define HOLD_SIZE 32 -#define IOBUF_SIZE 16384 - -#define DEFAULT_J 'B' -#define DEFAULT_R 'B' - -#define SJ0162 0x00e1 /* 01 - 62 ku offset */ -#define SJ6394 0x0161 /* 63 - 94 ku offset */ - - -/* MIME preprocessor */ - -#undef STRICT_MIME /* do stupid strict mime integrity check */ -#define GETC(p) ((!mime_mode)?getc(p):mime_getc(p)) -#define UNGETC(c,p) ((!mime_mode)?ungetc(c,p):mime_ungetc(c)) - - -#ifdef EASYWIN /*Easy Win */ -extern POINT _BufferSize; -#endif - -/* function prototype */ - -#ifndef _ -# ifdef __STDC__ -# define _(args) args -# else -# define _(args) () -# endif -#endif - -#ifndef PERL_XS -static void noconvert _((FILE *f)); -static int mime_integrity _((FILE *f,unsigned char *p)); -static int usage _((void)); -static char stdibuf[IOBUF_SIZE]; -static char stdobuf[IOBUF_SIZE]; -static unsigned int mime_input = 0; /* undecoded */ -static int end_check; -#endif - -static void kanji_convert _((FILE *f)); -static void h_conv _((FILE *f,int c2,int c1)); -static int push_hold_buf _((int c2,int c1)); -static void s_iconv _((int c2,int c1)); -static void e_oconv _((int c2,int c1)); -static void s_oconv _((int c2,int c1)); -static void j_oconv _((int c2,int c1)); -static int line_fold _((int c2,int c1)); -static int pre_convert _((int c1,int c2)); -static int mime_begin _((FILE *f)); -static int mime_getc _((FILE *f)); -static int mime_ungetc _((unsigned int c)); -static int base64decode _((int c)); -static void arguments _((char *c)); -static void reinit _((void)); - -/* buffers */ - -static unsigned char hold_buf[HOLD_SIZE*2]; -static int hold_count; - -/* MIME preprocessor fifo */ - -#define MIME_BUF_SIZE (1024) /* 2^n ring buffer */ -#define MIME_BUF_MASK (MIME_BUF_SIZE-1) -#define Fifo(n) mime_buf[(n)&MIME_BUF_MASK] -static unsigned char mime_buf[MIME_BUF_SIZE]; -static unsigned int mime_top = 0; -static unsigned int mime_last = 0; /* decoded */ - -/* flags */ -static int unbuf_f = FALSE; -static int estab_f = FALSE; -static int nop_f = FALSE; -static int binmode_f = TRUE; /* binary mode */ -static int rot_f = FALSE; /* rot14/43 mode */ -static int input_f = FALSE; /* non fixed input code */ -static int alpha_f = FALSE; /* convert JIx0208 alphbet to ASCII */ -static int mime_f = TRUE; /* convert MIME B base64 or Q */ -static int mimebuf_f = FALSE; /* MIME buffered input */ -static int broken_f = FALSE; /* convert ESC-less broken JIS */ -static int iso8859_f = FALSE; /* ISO8859 through */ -#if defined(MSDOS) || defined(__OS2__) -static int x0201_f = TRUE; /* Assume JISX0201 kana */ -#else -static int x0201_f = NO_X0201; /* Assume NO JISX0201 */ -#endif - -/* X0208 -> ASCII converter */ - -static int c1_return; - -/* fold parameter */ -static int line = 0; /* chars in line */ -static int prev = 0; -static int fold_f = FALSE; -static int fold_len = 0; - -/* options */ -static char kanji_intro = DEFAULT_J, - ascii_intro = DEFAULT_R; - -/* Folding */ - -int line_fold(); -#define FOLD_MARGIN 10 -#define DEFAULT_FOLD 60 - -/* converters */ - -#ifdef DEFAULT_CODE_JIS -# define DEFAULT_CONV j_oconv -#endif -#ifdef DEFAULT_CODE_SJIS -# define DEFAULT_CONV s_oconv -#endif -#ifdef DEFAULT_CODE_EUC -# define DEFAULT_CONV e_oconv -#endif - -static void (*iconv) _((int c2,int c1)); - /* s_iconv or oconv */ -static void (*oconv) _((int c2,int c1)) = DEFAULT_CONV; - /* [ejs]_oconv */ - -/* Global states */ -static int output_mode = ASCII, /* output kanji mode */ - input_mode = ASCII, /* input kanji mode */ - shift_mode = FALSE; /* TRUE shift out, or X0201 */ -static int mime_mode = FALSE; /* MIME mode B base64, Q hex */ - -/* X0201 / X0208 conversion tables */ - -/* X0201 kana conversion table */ -/* 90-9F A0-DF */ -unsigned char cv[]= { -0x21,0x21,0x21,0x23,0x21,0x56,0x21,0x57, -0x21,0x22,0x21,0x26,0x25,0x72,0x25,0x21, -0x25,0x23,0x25,0x25,0x25,0x27,0x25,0x29, -0x25,0x63,0x25,0x65,0x25,0x67,0x25,0x43, -0x21,0x3c,0x25,0x22,0x25,0x24,0x25,0x26, -0x25,0x28,0x25,0x2a,0x25,0x2b,0x25,0x2d, -0x25,0x2f,0x25,0x31,0x25,0x33,0x25,0x35, -0x25,0x37,0x25,0x39,0x25,0x3b,0x25,0x3d, -0x25,0x3f,0x25,0x41,0x25,0x44,0x25,0x46, -0x25,0x48,0x25,0x4a,0x25,0x4b,0x25,0x4c, -0x25,0x4d,0x25,0x4e,0x25,0x4f,0x25,0x52, -0x25,0x55,0x25,0x58,0x25,0x5b,0x25,0x5e, -0x25,0x5f,0x25,0x60,0x25,0x61,0x25,0x62, -0x25,0x64,0x25,0x66,0x25,0x68,0x25,0x69, -0x25,0x6a,0x25,0x6b,0x25,0x6c,0x25,0x6d, -0x25,0x6f,0x25,0x73,0x21,0x2b,0x21,0x2c, -0x00,0x00}; - - -/* X0201 kana conversion table for daguten */ -/* 90-9F A0-DF */ -unsigned char dv[]= { -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x2c,0x25,0x2e, -0x25,0x30,0x25,0x32,0x25,0x34,0x25,0x36, -0x25,0x38,0x25,0x3a,0x25,0x3c,0x25,0x3e, -0x25,0x40,0x25,0x42,0x25,0x45,0x25,0x47, -0x25,0x49,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x50,0x25,0x53, -0x25,0x56,0x25,0x59,0x25,0x5c,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00}; - -/* X0201 kana conversion table for han-daguten */ -/* 90-9F A0-DF */ -unsigned char ev[]= { -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x25,0x51,0x25,0x54, -0x25,0x57,0x25,0x5a,0x25,0x5d,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x00,0x00}; - - -/* X0208 kigou conversion table */ -/* 0x8140 - 0x819e */ -unsigned char fv[] = { - -0x00,0x00,0x00,0x00,0x2c,0x2e,0x00,0x3a, -0x3b,0x3f,0x21,0x00,0x00,0x27,0x60,0x00, -0x5e,0x00,0x5f,0x00,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x2d,0x00,0x2f, -0x5c,0x00,0x00,0x7c,0x00,0x00,0x60,0x27, -0x22,0x22,0x28,0x29,0x00,0x00,0x5b,0x5d, -0x7b,0x7d,0x3c,0x3e,0x00,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x2b,0x2d,0x00,0x00, -0x00,0x3d,0x00,0x3c,0x3e,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x24,0x00,0x00,0x25,0x23,0x26,0x2a,0x40, -0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -} ; - - -static int file_out = FALSE; -static int add_cr = FALSE; -static int del_cr = FALSE; - -#ifndef PERL_XS -int -main(argc, argv) - int argc; - char **argv; -{ - FILE *fin; - char *cp; - -#ifdef EASYWIN /*Easy Win */ - _BufferSize.y = 400;/*Set Scroll Buffer Size*/ -#endif - - for (argc--,argv++; (argc > 0) && **argv == '-'; argc--, argv++) { - cp = *argv; - arguments(cp); - } - - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - fprintf(stderr,"Mixed ISO8859/JISX0201/SJIS/EUC output is not allowed.\n"); - exit(1); - } - - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","wb",stdout) == NULL) - return (-1); -#else - setbinmode(stdout); -#endif - - if(unbuf_f) - setbuf(stdout, (char *) NULL); - else - setvbuffer(stdout, stdobuf, IOBUF_SIZE); - - if(argc == 0) { - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","rb",stdin) == NULL) return (-1); -#else - setbinmode(stdin); -#endif - setvbuffer(stdin, stdibuf, IOBUF_SIZE); - if(nop_f) - noconvert(stdin); - else - kanji_convert(stdin); - } else { - while (argc--) { - if((fin = fopen(*argv++, "r")) == NULL) { - perror(*--argv); - return(-1); - } else { -/* reopen file for stdout */ - if(file_out == TRUE){ - if(argc == 1 ) { - if(freopen(*argv++, "w", stdout) == NULL) { - perror(*--argv); - return (-1); - } - argc--; - } else { - if(freopen("nkf.out", "w", stdout) == NULL) { - perror(*--argv); - return (-1); - } - } - if(binmode_f == TRUE) { -#ifdef __OS2__ - if(freopen("","wb",stdout) == NULL) - return (-1); -#else - setbinmode(stdout); -#endif - } - } - if(binmode_f == TRUE) -#ifdef __OS2__ - if(freopen("","rb",fin) == NULL) - return (-1); -#else - setbinmode(fin); -#endif - setvbuffer(fin, stdibuf, IOBUF_SIZE); - if(nop_f) - noconvert(fin); - else - kanji_convert(fin); - fclose(fin); - } - } - } -#ifdef EASYWIN /*Easy Win */ - if(file_out == FALSE) - scanf("%d",&end_check); - else - fclose(stdout); -#else /* for Other OS */ - if(file_out == TRUE) - fclose(stdout); -#endif - return (0); -} -#endif - -static void -arguments(cp) - char *cp; -{ - while (*cp) { - switch (*cp++) { - case 'b': /* buffered mode */ - unbuf_f = FALSE; - continue; - case 'u': /* non bufferd mode */ - unbuf_f = TRUE; - continue; - case 't': /* transparent mode */ - nop_f = TRUE; - continue; - case 'j': /* JIS output */ - case 'n': - oconv = j_oconv; - continue; - case 'e': /* AT&T EUC output */ - oconv = e_oconv; - continue; - case 's': /* SJIS output */ - oconv = s_oconv; - continue; - case 'l': /* ISO8859 Latin-1 support, no conversion */ - iso8859_f = TRUE; /* Only compatible with ISO-2022-JP */ - input_f = LATIN1_INPUT; - continue; - case 'i': /* Kanji IN ESC-$-@/B */ - if(*cp=='@'||*cp=='B') - kanji_intro = *cp++; - continue; - case 'o': /* ASCII IN ESC-(-J/B */ - if(*cp=='J'||*cp=='B'||*cp=='H') - ascii_intro = *cp++; - continue; - case 'r': - rot_f = TRUE; - continue; -#if defined(MSDOS) || defined(__OS2__) - case 'T': - binmode_f = FALSE; - continue; -#endif -#ifndef PERL_XS - case 'v': - usage(); - exit(1); - break; -#endif - /* Input code assumption */ - case 'J': /* JIS input */ - case 'E': /* AT&T EUC input */ - input_f = JIS_INPUT; - continue; - case 'S': /* MS Kanji input */ - input_f = SJIS_INPUT; - if(x0201_f==NO_X0201) x0201_f=TRUE; - continue; - case 'Z': /* Convert X0208 alphabet to asii */ - /* bit:0 Convert X0208 - bit:1 Convert Kankaku to one space - bit:2 Convert Kankaku to two spaces - */ - if('9'>= *cp && *cp>='0') - alpha_f |= 1<<(*cp++ -'0'); - else - alpha_f |= TRUE; - continue; - case 'x': /* Convert X0201 kana to X0208 or X0201 Conversion */ - x0201_f = FALSE; /* No X0201->X0208 conversion */ - /* accept X0201 - ESC-(-I in JIS, EUC, MS Kanji - SI/SO in JIS, EUC, MS Kanji - SSO in EUC, JIS, not in MS Kanji - MS Kanji (0xa0-0xdf) - output X0201 - ESC-(-I in JIS (0x20-0x5f) - SSO in EUC (0xa0-0xdf) - 0xa0-0xd in MS Kanji (0xa0-0xdf) - */ - continue; - case 'X': /* Assume X0201 kana */ - /* Default value is NO_X0201 for EUC/MS-Kanji mix */ - x0201_f = TRUE; - continue; - case 'f': /* folding -f60 or -f */ - fold_f = TRUE; - fold_len = atoi(cp); - if(!(0<fold_len && fold_len<BUFSIZ)) - fold_len = DEFAULT_FOLD; - while('0'<= *cp && *cp <='9') cp++; - continue; - case 'm': /* MIME support */ - mime_f = TRUE; - if(*cp=='B'||*cp=='Q') { - mime_mode = *cp++; - mimebuf_f = FIXED_MIME; - } else if (*cp=='0') { - mime_f = FALSE; - } - continue; - case 'M': /* MIME output */ - oconv = j_oconv; /* sorry... not yet done.. */ - continue; - case 'B': /* Broken JIS support */ - /* bit:0 no ESC JIS - bit:1 allow any x on ESC-(-x or ESC-$-x - bit:2 reset to ascii on NL - */ - if('9'>= *cp && *cp>='0') - broken_f |= 1<<(*cp++ -'0'); - else - broken_f |= TRUE; - continue; -#ifndef PERL_XS - case 'O':/* for Output file */ - file_out = TRUE; - continue; -#endif - case 'c':/* add cr code */ - add_cr = TRUE; - continue; - case 'd':/* delete cr code */ - del_cr = TRUE; - continue; - default: - /* bogus option but ignored */ - continue; - } - } -} - -#ifndef PERL_XS -static void -noconvert(f) - FILE *f; -{ - int c; - - while ((c = getc(f)) != EOF) - putchar(c); -} -#endif - - -static void -kanji_convert(f) - FILE *f; -{ - int c1, c2; - - c2 = 0; - - if(input_f == JIS_INPUT || input_f == LATIN1_INPUT) { - estab_f = TRUE; iconv = oconv; - } else if(input_f == SJIS_INPUT) { - estab_f = TRUE; iconv = s_iconv; - } else { - estab_f = FALSE; iconv = oconv; - } - input_mode = ASCII; - output_mode = ASCII; - shift_mode = FALSE; - -#define NEXT continue /* no output, get next */ -#define SEND ; /* output c1 and c2, get next */ -#define LAST break /* end of loop, go closing */ - - while ((c1 = GETC(f)) != EOF) { - if(c2) { - /* second byte */ - if(c2 > DEL) { - /* in case of 8th bit is on */ - if(!estab_f) { - /* in case of not established yet */ - if(c1 > SSP) { - /* It is still ambiguious */ - h_conv(f, c2, c1); - c2 = 0; - NEXT; - } else if(c1 < AT) { - /* ignore bogus code */ - c2 = 0; - NEXT; - } else { - /* established */ - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - SEND; - } - } else - /* in case of already established */ - if(c1 < AT) { - /* ignore bogus code */ - c2 = 0; - NEXT; - } else - SEND; - } else - /* 7 bit code */ - /* it might be kanji shitfted */ - if((c1 == DEL) || (c1 <= SPACE)) { - /* ignore bogus first code */ - c2 = 0; - NEXT; - } else - SEND; - } else { - /* first byte */ - if(c1 > DEL) { - /* 8 bit code */ - if(!estab_f && !iso8859_f) { - /* not established yet */ - if(c1 < SSP) { - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - } else if(c1 < 0xe0) { - /* it seems to be EUC */ - estab_f = TRUE; - iconv = oconv; - } else { - /* still ambiguious */ - } - c2 = c1; - NEXT; - } else { /* estab_f==TRUE */ - if(iso8859_f) { - SEND; - } else if(SSP<=c1 && c1<0xe0 && iconv == s_iconv) { - /* SJIS X0201 Case... */ - /* This is too arrogant, but ... */ - if(x0201_f==NO_X0201) { - iconv = oconv; - c2 = c1; - NEXT; - } else - if(x0201_f) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - /* look ahead for X0201/X0208conversion */ - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } else if(c2==(0xde)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } - UNGETC(c2,f); c2 = 0; - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1==SSO && iconv != s_iconv) { - /* EUC X0201 Case */ - /* This is too arrogant - if(x0201_f == NO_X0201) { - estab_f = FALSE; - c2 = 0; - NEXT; - } */ - c1 = GETC(f); /* skip SSO */ - euc_1byte_check: - if(x0201_f && SSP<=c1 && c1<0xe0) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } - /* forward lookup $BByE@(B/$BH>ByE@(B */ - if(c2 != SSO) { - UNGETC(c2,f); c2 = 0; - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - (*oconv)(0,SSO); - LAST; - } else if(c2==(0xde)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - /* we have to check this c2 */ - /* and no way to push back SSO */ - c1 = c2; c2 = 0; - goto euc_1byte_check; - } - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1 < SSP && iconv != s_iconv) { - /* strange code in EUC */ - iconv = s_iconv; /* try SJIS */ - c2 = c1; - NEXT; - } else { - /* already established */ - c2 = c1; - NEXT; - } - } - } else if((c1 > SPACE) && (c1 != DEL)) { - /* in case of Roman characters */ - if(shift_mode) { - c1 |= 0x80; - /* output 1 shifted byte */ - if(x0201_f && (!iso8859_f||input_mode==X0201) && - SSP<=c1 && c1<0xe0 ) { - if(dv[(c1-SSP)*2]||ev[(c1-SSP)*2]) { - if((c2 = GETC(f)) == EOF) { - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - LAST; - } else if(c2==(0xde&0x7f)) { /* $BByE@(B */ - (*oconv)(dv[(c1-SSP)*2],dv[(c1-SSP)*2+1]); - c2=0; - NEXT; - } else if(c2==(0xdf&0x7f)&&ev[(c1-SSP)*2]) { - /* $BH>ByE@(B */ - (*oconv)(ev[(c1-SSP)*2],ev[(c1-SSP)*2+1]); - c2=0; - NEXT; - } - UNGETC(c2,f); c2 = 0; - } - (*oconv)(cv[(c1-SSP)*2],cv[(c1-SSP)*2+1]); - NEXT; - } else - SEND; - } else if(c1 == '(' && broken_f && input_mode == X0208 - && !mime_mode ) { - /* Try to recover missing escape */ - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, '('); - LAST; - } else { - if(c1 == 'B' || c1 == 'J' || c1 == 'H') { - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, '('); - /* do not modify various input_mode */ - /* It can be vt100 sequence */ - SEND; - } - } - } else if(input_mode == X0208) { - /* in case of Kanji shifted */ - c2 = c1; - NEXT; - /* goto next_byte */ - } else if(c1 == '=' && mime_f && !mime_mode ) { - if((c1 = getc(f)) == EOF) { - (*oconv)(0, '='); - LAST; - } else if(c1 == '?') { - /* =? is mime conversiooon start sequence */ - if(mime_begin(f) == EOF) /* check in detail */ - LAST; - else - NEXT; - } else { - (*oconv)(0, '='); - ungetc(c1,f); - NEXT; - } - } else if(c1 == '$' && broken_f && !mime_mode) { - /* try to recover missing escape */ - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, '$'); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* in case of Kanji in ESC sequence */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - /* sorry */ - (*oconv)(0, '$'); - (*oconv)(0, c1); - NEXT; - } - } else - SEND; - } else if(c1 == SI) { - shift_mode = FALSE; - NEXT; - } else if(c1 == SO) { - shift_mode = TRUE; - NEXT; - } else if(c1 == ESC) { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - LAST; - } else if(c1 == '$') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else if(c1 == '(') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - LAST; - } else if(c1 == '@'|| c1 == 'B') { - /* This is kanji introduction */ - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, '('); - (*oconv)(0, c1); - NEXT; - } - } else if(broken_f&0x2) { - input_mode = X0208; - shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '$'); - (*oconv)(0, c1); - NEXT; - } - } else if(c1 == '(') { - if((c1 = GETC(f)) == EOF) { - (*oconv)(0, ESC); - (*oconv)(0, '('); - LAST; - } else { - if(c1 == 'I') { - /* This is X0201 kana introduction */ - input_mode = X0201; shift_mode = X0201; - NEXT; - } else if(c1 == 'B' || c1 == 'J' || c1 == 'H') { - /* This is X0208 kanji introduction */ - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else if(broken_f&0x2) { - input_mode = ASCII; shift_mode = FALSE; - NEXT; - } else { - (*oconv)(0, ESC); - (*oconv)(0, '('); - /* maintain various input_mode here */ - SEND; - } - } - } else { - /* lonely ESC */ - (*oconv)(0, ESC); - SEND; - } - } else if(c1 == NL && broken_f&4) { - input_mode = ASCII; - SEND; - } else - SEND; - } - /* send: */ - if(input_mode == X0208) - (*oconv)(c2, c1); /* this is JIS, not SJIS/EUC case */ - else - (*iconv)(c2, c1); /* can be EUC/SJIS */ - c2 = 0; - continue; - /* goto next_word */ - } - - /* epilogue */ - (*iconv)(EOF, 0); -} - - - - -static void -h_conv(f, c2, c1) - FILE *f; - int c1, c2; -{ - int wc; - - - /** it must NOT be in the kanji shifte sequence */ - /** it must NOT be written in JIS7 */ - /** and it must be after 2 byte 8bit code */ - - hold_count = 0; - push_hold_buf(c2, c1); - c2 = 0; - - while ((c1 = GETC(f)) != EOF) { - if(c2) { - /* second byte */ - if(!estab_f) { - /* not established */ - if(c1 > SSP) { - /* it is still ambiguious yet */ - SEND; - } else if(c1 < AT) { - /* ignore bogus first byte */ - c2 = 0; - SEND; - } else { - /* now established */ - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - SEND; - } - } else - SEND; - } else { - /* First byte */ - if(c1 > DEL) { - /* 8th bit is on */ - if(c1 < SSP) { - /* it seems to be MS Kanji */ - estab_f = TRUE; - iconv = s_iconv; - } else if(c1 < 0xe0) { - /* it seems to be EUC */ - estab_f = TRUE; - iconv = oconv; - } else { - /* still ambiguious */ - } - c2 = c1; - NEXT; - } else - /* 7 bit code , then send without any process */ - SEND; - } - /* send: */ - if((push_hold_buf(c2, c1) == EOF) || estab_f) - break; - c2 = 0; - continue; - } - - /** now, - ** 1) EOF is detected, or - ** 2) Code is established, or - ** 3) Buffer is FULL (but last word is pushed) - ** - ** in 1) and 3) cases, we continue to use - ** Kanji codes by oconv and leave estab_f unchanged. - **/ - - for (wc = 0; wc < hold_count; wc += 2) { - c2 = hold_buf[wc]; - c1 = hold_buf[wc+1]; - (*iconv)(c2, c1); - } - return; -} - - - -static int -push_hold_buf(c2, c1) - int c2, c1; -{ - if(hold_count >= HOLD_SIZE*2) - return (EOF); - hold_buf[hold_count++] = c2; - hold_buf[hold_count++] = c1; - return ((hold_count >= HOLD_SIZE*2) ? EOF : hold_count); -} - - -static void -s_iconv(c2, c1) - int c2, - c1; -{ - if((c2 == EOF) || (c2 == 0)) { - /* NOP */ - } else { - c2 = c2 + c2 - ((c2 <= 0x9f) ? SJ0162 : SJ6394); - if(c1 < 0x9f) - c1 = c1 - ((c1 > DEL) ? SPACE : 0x1f); - else { - c1 = c1 - 0x7e; - c2++; - } - } - (*oconv)(c2, c1); -} - - -static void -e_oconv(c2, c1) - int c2, c1; -{ - c2 = pre_convert(c1,c2); c1 = c1_return; - if(fold_f) { - switch(line_fold(c2,c1)) { - case '\n': - if(add_cr == TRUE) { - putchar('\r'); - c1 = '\n'; - } - putchar('\n'); - break; - case 0: return; - case '\r': - c1 = '\n'; c2 = 0; - break; - case '\t': - case ' ': - c1 = ' '; c2 = 0; - break; - } - } - if(c2==DOUBLE_SPACE) { - putchar(' '); putchar(' '); - return; - } - if(c2 == EOF) - return; - else if(c2 == 0 && (c1&0x80)) { - putchar(SSO); putchar(c1); - } else if(c2 == 0) { - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else { - if((c1<0x20 || 0x7e<c1) || - (c2<0x20 || 0x7e<c2)) { - estab_f = FALSE; - return; /* too late to rescue this char */ - } - putchar(c2 | 0x080); - putchar(c1 | 0x080); - } - return; -} - - -static void -s_oconv(c2, c1) - int c2, c1; -{ - c2 = pre_convert(c1,c2); c1 = c1_return; - if(fold_f) { - switch(line_fold(c2,c1)) { - case '\n': - if(add_cr == TRUE) { - putchar('\r'); - c1 = '\n'; - } - putchar('\n'); - break; - case '\r': - c1 = '\n'; c2 = 0; - break; - case 0: return; - case '\t': - case ' ': - c1 = ' '; c2 = 0; - break; - } - } - if(c2==DOUBLE_SPACE) { - putchar(' '); putchar(' '); - return; - } - if(c2 == EOF) - return; - else if(c2 == 0) { - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else { - if((c1<0x20 || 0x7e<c1) || - (c2<0x20 || 0x7e<c2)) { - estab_f = FALSE; - return; /* too late to rescue this char */ - } - putchar((((c2 - 1) >> 1) + ((c2 <= 0x5e) ? 0x71 : 0xb1))); - putchar((c1 + ((c2 & 1) ? ((c1 < 0x60) ? 0x1f : 0x20) : 0x7e))); - } - return; -} - - -static void -j_oconv(c2, c1) - int c2, c1; -{ - c2 = pre_convert(c1,c2); c1 = c1_return; - if(fold_f) { - switch(line_fold(c2,c1)) { - case '\n': - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - } - if(add_cr == TRUE) { - putchar('\r'); - c1 = '\n'; - } - putchar('\n'); - output_mode = ASCII; - break; - case '\r': - c1 = '\n'; c2 = 0; - break; - case '\t': - case ' ': - c1 = ' '; c2 = 0; - break; - case 0: return; - } - } - if(c2 == EOF) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - } - } else if(c2 == 0 && (c1 & 0x80)) { - if(input_mode==X0201 || !iso8859_f) { - if(output_mode!=X0201) { - putchar(ESC); - putchar('('); - putchar('I'); - output_mode = X0201; - } - c1 &= 0x7f; - } else { - /* iso8859 introduction, or 8th bit on */ - /* Can we convert in 7bit form using ESC-'-'-A ? - Is this popular? */ - } - putchar(c1); - } else if(c2 == 0) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - output_mode = ASCII; - } - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else if(c2 == DOUBLE_SPACE) { - if(output_mode) { - putchar(ESC); - putchar('('); - putchar(ascii_intro); - output_mode = ASCII; - } - putchar(' '); - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } else { - if(output_mode != X0208) { - putchar(ESC); - putchar('$'); - putchar(kanji_intro); - output_mode = X0208; - } - if(c1<0x20 || 0x7e<c1) - return; - if(c2<0x20 || 0x7e<c2) - return; - putchar(c2); - if(c1 == '\n' && add_cr == TRUE) - putchar('\r'); - if(c1 != '\r') - putchar(c1); - else if(del_cr == FALSE) - putchar(c1); - } - return; -} - - - -#define rot13(c) ( \ - ( c < 'A' ) ? c: \ - (c <= 'M') ? (c + 13): \ - (c <= 'Z') ? (c - 13): \ - (c < 'a') ? (c): \ - (c <= 'm') ? (c + 13): \ - (c <= 'z') ? (c - 13): \ - (c) \ -) - -#define rot47(c) ( \ - ( c < '!' ) ? c: \ - ( c <= 'O' ) ? (c + 47) : \ - ( c <= '~' ) ? (c - 47) : \ - c \ -) - - -/* - Return value of line_fold() - - \n add newline and output char - \r add newline and output nothing - ' ' space - 0 skip - 1 (or else) normal output - - fold state in prev (previous character) - - >0x80 Japanese (X0208/X0201) - <0x80 ASCII - \n new line - ' ' space - - This fold algorthm does not preserve heading space in a line. - This is the main difference from fmt. -*/ - -static int -line_fold(c2,c1) -int c2,c1; -{ - int prev0; - if(c1=='\r') - return 0; /* ignore cr */ - if(c1== 8) { - if(line>0) line--; - return 1; - } - if(c2==EOF && line != 0) /* close open last line */ - return '\n'; - /* new line */ - if(c1=='\n') { - if(prev == c1) { /* duplicate newline */ - if(line) { - line = 0; - return '\n'; /* output two newline */ - } else { - line = 0; - return 1; - } - } else { - if(prev&0x80) { /* Japanese? */ - prev = c1; - return 0; /* ignore given single newline */ - } else if(prev==' ') { - return 0; - } else { - prev = c1; - if(++line<=fold_len) - return ' '; - else { - line = 0; - return '\r'; /* fold and output nothing */ - } - } - } - } - if(c1=='\f') { - prev = '\n'; - if(line==0) - return 1; - line = 0; - return '\n'; /* output newline and clear */ - } - /* X0208 kankaku or ascii space */ - if( (c2==0&&c1==' ')|| - (c2==0&&c1=='\t')|| - (c2==DOUBLE_SPACE)|| - (c2=='!'&& c1=='!')) { - if(prev == ' ') { - return 0; /* remove duplicate spaces */ - } - prev = ' '; - if(++line<=fold_len) - return ' '; /* output ASCII space only */ - else { - prev = ' '; line = 0; - return '\r'; /* fold and output nothing */ - } - } - prev0 = prev; /* we still need this one... , but almost done */ - prev = c1; - if(c2 || (SSP<=c1 && c1<=0xdf)) - prev |= 0x80; /* this is Japanese */ - line += (c2==0)?1:2; - if(line<=fold_len) { /* normal case */ - return 1; - } - if(line>=fold_len+FOLD_MARGIN) { /* too many kinsou suspension */ - line = (c2==0)?1:2; - return '\n'; /* We can't wait, do fold now */ - } - /* simple kinsoku rules return 1 means no folding */ - if(c2==0) { - if(c1==0xde) return 1; /* $B!+(B*/ - if(c1==0xdf) return 1; /* $B!,(B*/ - if(c1==0xa4) return 1; /* $B!#(B*/ - if(c1==0xa3) return 1; /* $B!$(B*/ - if(c1==0xa1) return 1; /* $B!W(B*/ - if(c1==0xb0) return 1; /* - */ - if(SSP<=c1 && c1<=0xdf) { /* X0201 */ - line = 1; - return '\n';/* add one new line before this character */ - } - /* fold point in ASCII { [ ( */ - if(( c1!=')'&& - c1!=']'&& - c1!='}'&& - c1!='.'&& - c1!=','&& - c1!='!'&& - c1!='?'&& - c1!='/'&& - c1!=':'&& - c1!=';')&& - ((prev0=='\n')|| (prev0==' ')|| /* ignored new line */ - (prev0&0x80)) /* X0208 - ASCII */ - ) { - line = 1; - return '\n';/* add one new line before this character */ - } - return 1; /* default no fold in ASCII */ - } else { - if(c2=='!') { - if(c1=='"') return 1; /* $B!"(B */ - if(c1=='#') return 1; /* $B!#(B */ - if(c1=='$') return 1; /* $B!$(B */ - if(c1=='%') return 1; /* $B!%(B */ - if(c1=='\'') return 1; /* $B!\(B */ - if(c1=='(') return 1; /* $B!((B */ - if(c1==')') return 1; /* $B!)(B */ - if(c1=='*') return 1; /* $B!*(B */ - if(c1=='+') return 1; /* $B!+(B */ - if(c1==',') return 1; /* $B!,(B */ - } - line = 2; - return '\n'; /* add one new line before this character */ - } -} - -static int -pre_convert(c1,c2) - int c1,c2; -{ - if(c2) c1 &= 0x7f; - c1_return = c1; - if(c2==EOF) return c2; - c2 &= 0x7f; - if(rot_f) { - if(c2) { - c1 = rot47(c1); - c2 = rot47(c2); - } else { - if(!(c1 & 0x80)) - c1 = rot13(c1); - } - c1_return = c1; - } - /* JISX0208 Alphabet */ - if(alpha_f && c2 == 0x23 ) return 0; - /* JISX0208 Kigou */ - if(alpha_f && c2 == 0x21 ) { - if(0x21==c1) { - if(alpha_f&0x2) { - c1_return = ' '; - return 0; - } else if(alpha_f&0x4) { - c1_return = ' '; - return DOUBLE_SPACE; - } else { - return c2; - } - } else if(0x20<c1 && c1<0x7f && fv[c1-0x20]) { - c1_return = fv[c1-0x20]; - return 0; - } - } - return c2; -} - - -#ifdef STRICT_MIME -/* This converts */ - -unsigned char *mime_pattern[] = { - (unsigned char *)"\075?ISO-8859-1?Q?", - (unsigned char *)"\075?ISO-2022-JP?B?", - (unsigned char *)"\075?ISO-2022-JP?Q?", - (unsigned char *)"\075?JAPANESE_EUC?B?", - (unsigned char *)"\075?SHIFT_JIS?B?", - NULL -}; - -int mime_encode[] = { - 'Q', 'B', 'Q', - 0 -}; -#endif - -#define MAXRECOVER 20 -int iso8859_f_save; - -#ifdef STRICT_MIME - -#define nkf_toupper(c) (('a'<=c && c<='z')?(c-('a'-'A')):c) -/* I don't trust portablity of toupper */ - -static int -mime_begin(f) - FILE *f; -{ - int c1; - int i,j,k; - unsigned char *p,*q; - int r[MAXRECOVER]; /* recovery buffer, max mime pattern lenght */ - - mime_mode = FALSE; - /* =? has been checked */ - j = 0; - p = mime_pattern[j]; - r[0]='='; r[1]='?'; - - for(i=2;p[i]>' ';i++) { /* start at =? */ - if( ((((r[i] = c1 = getc(f))==EOF) || nkf_toupper(c1) != p[i] ) { - /* pattern fails, try next one */ - q = p; - while (p = mime_pattern[++j]) { - for(k=2;k<i;k++) /* assume length(p) > i */ - if(p[k]!=q[k]) break; - if(k==i && nkf_toupper(c1)==p[k]) break; - } - if(p) continue; /* found next one, continue */ - /* all fails, output from recovery buffer */ - ungetc(c1,f); - for(j=0;j<i;j++) { - (*oconv)(0,r[j]); - } - return c1; - } - } - mime_mode = mime_encode[j]; - iso8859_f_save = iso8859_f; - if(j==0) { - iso8859_f = TRUE; - } - if(mime_mode=='B') { - mimebuf_f = unbuf_f; - if(!unbuf_f) { - /* do MIME integrity check */ - return mime_integrity(f,mime_pattern[j]); - } - } - mimebuf_f = TRUE; - return c1; -} - -#define mime_getc0(f) (mimebuf_f?getc(f):Fifo(mime_input++)) -#define mime_ungetc0(c,f) (mimebuf_f?ungetc(c,f):mime_input--) - -#else -static int -mime_begin(f) -FILE *f; -{ - int c1; - int i,j; - int r[MAXRECOVER]; /* recovery buffer, max mime pattern lenght */ - - mime_mode = FALSE; - /* =? has been checked */ - j = 0; - r[0]='='; r[1]='?'; - for(i=2;i<MAXRECOVER;i++) { /* start at =? */ - /* We accept any charcter type even if it is breaked by new lines */ - if( (r[i] = c1 = getc(f))==EOF) break; - if(c1=='=') break; - if(c1<' '&& c1!='\r' && c1!='\n') break; - if(c1=='?') { - i++; - if(!(i<MAXRECOVER) || (r[i] = c1 = getc(f))==EOF) break; - if(c1=='b'||c1=='B') { - mime_mode = 'B'; - } else if(c1=='q'||c1=='Q') { - mime_mode = 'Q'; - } else { - break; - } - i++; - if(!(i<MAXRECOVER) || (r[i] = c1 = getc(f))==EOF) break; - if(c1=='?') { - break; - } else { - mime_mode = FALSE; - } - break; - } - } - if(!mime_mode || c1==EOF || i==MAXRECOVER) { - ungetc(c1,f); - if (i == MAXRECOVER) - i--; - for(j=0;j<i;j++) { - (*oconv)(0,r[j]); - } - return c1; - } - iso8859_f_save = iso8859_f; - /* do no MIME integrity check */ - return c1; /* used only for checking EOF */ -} - -#define mime_getc0(f) getc(f) -#define mime_ungetc0(c,f) ungetc(c,f) - -#endif - -static int -mime_getc(f) - FILE *f; -{ - int c1, c2, c3, c4, cc; - int t1, t2, t3, t4, mode, exit_mode; - - if(mime_top != mime_last) { /* Something is in FIFO */ - return Fifo(mime_top++); - } - - if(mimebuf_f == FIXED_MIME) - exit_mode = mime_mode; - else - exit_mode = FALSE; - if(mime_mode == 'Q') { - if((c1 = mime_getc0(f)) == EOF) return (EOF); - if(c1=='_') return ' '; - if(c1!='=' && c1!='?') - return c1; - mime_mode = exit_mode; /* prepare for quit */ - if(c1<=' ') return c1; - if((c2 = mime_getc0(f)) == EOF) return (EOF); - if(c2<=' ') return c2; - if(c1=='?'&&c2=='=') { - /* end Q encoding */ - input_mode = exit_mode; - iso8859_f = iso8859_f_save; - return getc(f); - } - if(c1=='?') { - mime_mode = 'Q'; /* still in MIME */ - mime_ungetc0(c2,f); - return c1; - } - if((c3 = mime_getc0(f)) == EOF) return (EOF); - if(c2<=' ') return c2; - mime_mode = 'Q'; /* still in MIME */ -#define hex(c) (('0'<=c&&c<='9')?(c-'0'):\ - ('A'<=c&&c<='F')?(c-'A'+10):('a'<=c&&c<='f')?(c-'a'+10):0) - return ((hex(c2)<<4) + hex(c3)); - } - - if(mime_mode != 'B') { - mime_mode = FALSE; - return getc(f); - } - - - /* Base64 encoding */ - /* - MIME allows line break in the middle of - Base64, but we are very pessimistic in decoding - in unbuf mode because MIME encoded code may broken by - less or editor's control sequence (such as ESC-[-K in unbuffered - mode. ignore incomplete MIME. - */ - mode = mime_mode; - mime_mode = exit_mode; /* prepare for quit */ - - while ((c1 = mime_getc0(f))<=' ') { - if(c1==EOF) - return (EOF); - } - if((c2 = mime_getc0(f))<=' ') { - if(c2==EOF) - return (EOF); - if(mimebuf_f!=FIXED_MIME) input_mode = ASCII; - return c2; - } - if((c1 == '?') && (c2 == '=')) { - input_mode = ASCII; - while((c1 = getc(f))==' ' /* || c1=='\n' || c1=='\r' */); - return c1; - } - if((c3 = mime_getc0(f))<=' ') { - if(c3==EOF) - return (EOF); - if(mimebuf_f!=FIXED_MIME) input_mode = ASCII; - return c3; - } - if((c4 = mime_getc0(f))<=' ') { - if(c4==EOF) - return (EOF); - if(mimebuf_f!=FIXED_MIME) input_mode = ASCII; - return c4; - } - - mime_mode = mode; /* still in MIME sigh... */ - - /* BASE 64 decoding */ - - t1 = 0x3f & base64decode(c1); - t2 = 0x3f & base64decode(c2); - t3 = 0x3f & base64decode(c3); - t4 = 0x3f & base64decode(c4); - cc = ((t1 << 2) & 0x0fc) | ((t2 >> 4) & 0x03); - if(c2 != '=') { - Fifo(mime_last++) = cc; - cc = ((t2 << 4) & 0x0f0) | ((t3 >> 2) & 0x0f); - if(c3 != '=') { - Fifo(mime_last++) = cc; - cc = ((t3 << 6) & 0x0c0) | (t4 & 0x3f); - if(c4 != '=') - Fifo(mime_last++) = cc; - } - } else { - return c1; - } - return Fifo(mime_top++); -} - -static int -mime_ungetc(c) - unsigned int c; -{ - Fifo(mime_last++) = c; - return c; -} - -#ifdef STRICT_MIME -int -mime_integrity(f,p) - FILE *f; - unsigned char *p; -{ - int c,d; - unsigned int q; - /* In buffered mode, read until =? or NL or buffer fffull - */ - mime_input = mime_top; - mime_last = mime_top; - while(*p) Fifo(mime_input++) = *p++; - d = 0; - q = mime_input; - while((c=getc(f))!=EOF) { - if(((mime_input-mime_top)&MIME_BUF_MASK)==0) break; - if(c=='=' && d=='?') { - /* checked. skip header, start decode */ - Fifo(mime_input++) = c; - mime_input = q; - return 1; - } - if(!( (c=='+'||c=='/'|| c=='=' || c=='?' || - ('a'<=c && c<='z')||('A'<= c && c<='Z')||('0'<=c && c<='9')))) - break; - /* Should we check length mod 4? */ - Fifo(mime_input++) = c; - d=c; - } - /* In case of Incomplete MIME, no MIME decode */ - Fifo(mime_input++) = c; - mime_last = mime_input; /* point undecoded buffer */ - mime_mode = 1; /* no decode on Fifo last in mime_getc */ - return 1; -} -#endif - -static int -base64decode(c) - int c; -{ - int i; - if(c > '@') - if(c < '[') - i = c - 'A'; /* A..Z 0-25 */ - else - i = c - 'G' /* - 'a' + 26 */ ; /* a..z 26-51 */ - else if(c > '/') - i = c - '0' + '4' /* - '0' + 52 */ ; /* 0..9 52-61 */ - else if(c == '+') - i = '>' /* 62 */ ; /* + 62 */ - else - i = '?' /* 63 */ ; /* / 63 */ - return (i); -} - -static void -reinit() -{ - unbuf_f = FALSE; - estab_f = FALSE; - nop_f = FALSE; - binmode_f = TRUE; - rot_f = FALSE; - input_f = FALSE; - alpha_f = FALSE; - mime_f = TRUE; - mimebuf_f = FALSE; - broken_f = FALSE; - iso8859_f = FALSE; - x0201_f = TRUE; - x0201_f = NO_X0201; - fold_f = FALSE; - kanji_intro = DEFAULT_J; - ascii_intro = DEFAULT_R; - oconv = DEFAULT_CONV; - output_mode = ASCII; - input_mode = ASCII; - shift_mode = FALSE; - mime_mode = FALSE; - file_out = FALSE; - add_cr = FALSE; - del_cr = FALSE; - line = 0; -} - -#ifndef PERL_XS -int -usage() -{ - fprintf(stderr,"USAGE: nkf(nkf32,wnkf,nkf2) -[flags] [in file] .. [out file for -O flag]\n"); - fprintf(stderr,"Flags:\n"); - fprintf(stderr,"b,u Output is bufferred (DEFAULT),Output is unbufferred\n"); -#ifdef DEFAULT_CODE_SJIS - fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS (DEFAULT), AT&T JIS (EUC)\n"); -#endif -#ifdef DEFAULT_CODE_JIS - fprintf(stderr,"j,s,e Outout code is JIS 7 bit (DEFAULT), Shift JIS, AT&T JIS (EUC)\n"); -#endif -#ifdef DEFAULT_CODE_EUC - fprintf(stderr,"j,s,e Outout code is JIS 7 bit, Shift JIS, AT&T JIS (EUC) (DEFAULT)\n"); -#endif - fprintf(stderr,"J,S,E Input assumption is JIS 7 bit , Shift JIS, AT&T JIS (EUC)\n"); - fprintf(stderr,"t no conversion\n"); - fprintf(stderr,"i_ Output sequence to designate JIS-kanji (DEFAULT B)\n"); - fprintf(stderr,"o_ Output sequence to designate ASCII (DEFAULT B)\n"); - fprintf(stderr,"r {de/en}crypt ROT13/47\n"); - fprintf(stderr,"v Show this usage\n"); - fprintf(stderr,"m[BQ0] MIME decode [B:base64,Q:quoted,0:no decode]\n"); - fprintf(stderr,"l ISO8859-1 (Latin-1) support\n"); - fprintf(stderr,"f Folding: -f60 or -f\n"); - fprintf(stderr,"Z[0-2] Convert X0208 alphabet to ASCII 1: Kankaku to space,2: 2 spaces\n"); - fprintf(stderr,"X,x Assume X0201 kana in MS-Kanji, -x preserves X0201\n"); - fprintf(stderr,"B[0-2] Broken input 0: missing ESC,1: any X on ESC-[($]-X,2: ASCII on NL\n"); -#ifdef MSDOS - fprintf(stderr,"T Text mode output\n"); -#endif - fprintf(stderr,"O Output to File (DEFAULT 'nkf.out')\n"); - fprintf(stderr,"d,c Delete \\r in line feed, Add \\r in line feed\n"); - fprintf(stderr,"Network Kanji Filter Version %s (%s) " -#if defined(MSDOS) && !defined(_Windows) - "for DOS" -#endif -#if !defined(__WIN32__) && defined(_Windows) - "for Win16" -#endif -#if defined(__WIN32__) && defined(_Windows) - "for Win32" -#endif -#ifdef __OS2__ - "for OS/2" -#endif - ,Version,Patchlevel); - fprintf(stderr,"\n%s\n",CopyRight); - return 0; -} -#endif - -/** - ** $B%Q%C%A@):n<T(B - ** void@merope.pleiades.or.jp (Kusakabe Youichi) - ** NIDE Naoyuki <nide@ics.nara-wu.ac.jp> - ** ohta@src.ricoh.co.jp (Junn Ohta) - ** inouet@strl.nhk.or.jp (Tomoyuki Inoue) - ** kiri@pulser.win.or.jp (Tetsuaki Kiriyama) - ** Kimihiko Sato <sato@sail.t.u-tokyo.ac.jp> - ** a_kuroe@kuroe.aoba.yokohama.jp (Akihiko Kuroe) - ** kono@ie.u-ryukyu.ac.jp (Shinji Kono) - ** GHG00637@nifty-serve.or.jp (COW) - ** - ** $B:G=*99?7F|(B - ** 1998.11.7 - **/ - -/* end */ diff --git a/ext/nkf/test.rb b/ext/nkf/test.rb index 4519f8ba7e..7a2390d649 100644 --- a/ext/nkf/test.rb +++ b/ext/nkf/test.rb @@ -1,3 +1,19 @@ +#!/usr/local/bin/ruby +# +# nkf test program for nkf 1.7 +# Shinji KONO <kono@ie.u-ryukyu.ac.jp> +# Sun Aug 18 12:25:40 JST 1996 +# Sun Nov 8 00:16:06 JST 1998 +# +# This is useful when you add new patch on nkf. +# Since this test is too strict, faileurs may not mean +# wrong conversion. +# +# nkf 1.5 differs on MIME decoding +# nkf 1.4 passes Basic Conversion tests +# nkf PDS version passes Basic Conversion tests using "nkf -iB -oB " +# + $counter = 0 def result(result, message = nil) $counter += 1 @@ -49,41 +65,150 @@ end $detail = false -def test(opt, input, expect) +def test(opt, input, expects) print "\nINPUT:\n", input if $detail - print "\nEXPECT:\n", expect if $detail + print "\nEXPECT:\n", expects.to_s if $detail result = nkf(opt, input) print "\nGOT:\n", result if $detail - print result == expect ? "Ok\n" : "Fail\n" - return result + expects.each do |e| + if result == e then + puts "Ok" + return result + end + end + puts "Fail" end + +example = Hash.new + # Basic Conversion -print "\nBasic Conversion test\n\n" +print "\nBasic Conversion test\n\n"; + +# I gave up simple literal quote because there are big difference +# on perl4 and perl5 on literal quote. Of course we cannot use +# jperl. -example = {} example['jis'] = <<'eofeof'.unpack('u')[0] M1FER<W0@4W1A9V4@&R1"(3DQ(3%^2R%+?D]3&RA"(%-E8V]N9"!3=&%G92`; M)$)0)TU:&RA"($AI<F%G86YA(!LD0B0B)"0D)B0H)"HD;R1R)',;*$(*2V%T M86MA;F$@&R1")2(E)"4F)2@E*B5O)7(E<QLH0B!+:6=O=2`;)$(A)B%G(S`C /029!)E@G(B=!*$`;*$(* eofeof -#' example['sjis'] = <<'eofeof'.unpack('u')[0] M1FER<W0@4W1A9V4@@5B)0(F>ED"6GIAR(%-E8V]N9"!3=&%G92"8I9=Y($AI M<F%G86YA((*@@J*"I(*F@JB"[8+P@O$*2V%T86MA;F$@@T�X-%@T>#28./ >@Y*#DR!+:6=O=2"!18&'@D^"8(._@]:$081@A+X* eofeof -#' example['euc'] = <<'eofeof'.unpack('u')[0] M1FER<W0@4W1A9V4@H;FQH;'^RZ'+_L_3(%-E8V]N9"!3=&%G92#0I\W:($AI M<F%G86YA(*2BI*2DIJ2HI*JD[Z3RI/,*2V%T86MA;F$@I:*EI*6FI:BEJJ7O >I?*E\R!+:6=O=2"AIJ'GH["CP:;!IMBGHJ?!J,`* eofeof -#' + +example['utf'] = <<'eofeof'.unpack('u')[0] +M1FER<W0@4W1A9V4@XX"%Z9FBY;^<YK.5YKJ`Z(65(%-E8V]N9"!3=&%G92#D +MN+SI@:4@2&ER86=A;F$@XX&"XX&$XX&&XX&(XX&*XX*/XX*2XX*3"DMA=&%K +M86YA(.."HN."I.."IN."J.."JN.#K^.#LN.#LR!+:6=O=2#C@[OBB)[OO)#O +.O*'.L<^)T)'0K^*5@@H` +eofeof + + +example['jis1'] = <<'eofeof'.unpack('u')[0] +M&R1";3%Q<$$L&RA""ALD0F4Z3F\;*$(*&R1"<FT;*$()&R1"/F5.3D]+&RA" +#"0D* +eofeof + +example['sjis1'] = <<'eofeof'.unpack('u')[0] +8YU#ID)%+"N-9E^T*Z>L)C^.7S)AJ"0D* +eofeof + +example['euc1'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['utf1'] = <<'eofeof'.unpack('u')[0] +AZ+J%Z:N/Z8JM"N>VNNFZEPKIM(D)Y+B*Z:"8Y+J8"0D* +eofeof + +example['jis2'] = <<'eofeof'.unpack('u')[0] ++&R1".EA&(QLH0@H` +eofeof + +example['sjis2'] = <<'eofeof'.unpack('u')[0] +%C=:3H0H` +eofeof + +example['euc2'] = <<'eofeof'.unpack('u')[0] +%NMC&HPH` +eofeof + +example['utf2'] = <<'eofeof'.unpack('u')[0] +'YI:.Z)>D"@`` +eofeof + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis'],[example['jis']]); +print "JIS to SJIS... ";test('-s',example['jis'],[example['sjis']]); +print "JIS to EUC ... ";test('-e',example['jis'],[example['euc']]); +print "JIS to UTF8... ";test('-w',example['jis'],[example['utf']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis'],[example['jis']]); +print "SJIS to SJIS... ";test('-s',example['sjis'],[example['sjis']]); +print "SJIS to EUC ... ";test('-e',example['sjis'],[example['euc']]); +print "SJIS to UTF8... ";test('-w',example['sjis'],[example['utf']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc'],[example['jis']]); +print "EUC to SJIS... ";test('-s',example['euc'],[example['sjis']]); +print "EUC to EUC ... ";test('-e',example['euc'],[example['euc']]); +print "EUC to UTF8... ";test('-w',example['euc'],[example['utf']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf'],[example['jis']]); +print "UTF8 to SJIS... ";test('-s',example['utf'],[example['sjis']]); +print "UTF8 to EUC ... ";test('-e',example['utf'],[example['euc']]); +print "UTF8 to UTF8... ";test('-w',example['utf'],[example['utf']]); + + + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis1'],[example['jis1']]); +print "JIS to SJIS... ";test('-s',example['jis1'],[example['sjis1']]); +print "JIS to EUC ... ";test('-e',example['jis1'],[example['euc1']]); +print "JIS to UTF8... ";test('-w',example['jis1'],[example['utf1']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis1'],[example['jis1']]); +print "SJIS to SJIS... ";test('-s',example['sjis1'],[example['sjis1']]); +print "SJIS to EUC ... ";test('-e',example['sjis1'],[example['euc1']]); +print "SJIS to UTF8... ";test('-w',example['sjis1'],[example['utf1']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc1'],[example['jis1']]); +print "EUC to SJIS... ";test('-s',example['euc1'],[example['sjis1']]); +print "EUC to EUC ... ";test('-e',example['euc1'],[example['euc1']]); +print "EUC to UTF8... ";test('-w',example['euc1'],[example['utf1']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf1'],[example['jis1']]); +print "UTF8 to SJIS... ";test('-s',example['utf1'],[example['sjis1']]); +print "UTF8 to EUC ... ";test('-e',example['utf1'],[example['euc1']]); +print "UTF8 to UTF8... ";test('-w',example['utf1'],[example['utf1']]); + +# Ambigous Case example['amb'] = <<'eofeof'.unpack('u')[0] MI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&E @@ -117,6 +242,31 @@ M)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q >)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(* eofeof +print "Ambiguous Case. "; + test('-j',example['amb'],[example['amb.euc']]); + +# Input assumption + +print "SJIS Input assumption "; + test('-jSx',example['amb'],[example['amb.sjis']]); + +# Broken JIS + +print "Broken JIS "; + $input = example['jis']; + $input.gsub("\033",''); + test('-Be',$input,[example['euc']]); +print "Broken JIS is safe on Normal JIS? "; + $input = example['jis']; + test('-Be',$input,[example['euc']]); + +# X0201 仮名 +# X0201->X0208 conversion +# X0208 aphabet -> ASCII +# X0201 相互変換 + +print "\nX0201 test\n\n"; + example['x0201.sjis'] = <<'eofeof'.unpack('u')[0] MD5.*<(-*@TR#3H-0@U*#2X--@T^#48-3"I%3B7""8()A@F*"8X)D@F6"9H*! M@H*"@X*$@H6"AH*'"I%3BTR-AH%)@9>!E(&0@9.!3X&5@9:!:8%J@7R!>X&! @@ -124,7 +274,6 @@ M@6V!;H%O@7"!CPJ4O(IPMK>X/;FZMMZWWKC>N=ZZWH+&"I2\BG#*W\O?S-_- MW\[?M]^QW@K*W\O?S`IH86YK86MU(,K?R]_,I`K*W\O?S-VA"I2\BG""S(SC !"@!" eofeof -#' example['x0201.euc'] = <<'eofeof'.unpack('u')[0] MP;2ST:6KI:VEKZ6QI;.EK*6NI;"ELJ6T"L&TL=&CP:/"H\.CQ*/%H\:CQZ/A @@ -134,7 +283,17 @@ MWJ3("LB^L]&.RH[?CLN.WX[,CM^.S8[?CLZ.WXZWCM^.L8[>"H[*CM^.RX[? MCLP*:&%N:V%K=2".RH[?CLN.WX[,CJ0*CLJ.WX[+CM^.S([=CJ$*R+ZST:3. #N.4* eofeof -#' + +example['x0201.utf'] = <<'eofeof'.unpack('u')[0] +MY86HZ*>2XX*KXX*MXX*OXX*QXX*SXX*LXX*NXX*PXX*RXX*T"N6%J.B+L>^\ +MH>^\HN^\H^^\I.^\I>^\IN^\I^^]@>^]@N^]@^^]A.^]A>^]AN^]APKEA:CH +MJ)CEC[?OO('OO*#OO(/OO(3OO(7OO+[OO(;OO(KOO(COO(GBB)+OO(OOO)WO +MO+OOO+WOO9OOO9WOOZ4*Y8V*Z*>2[[VV[[VW[[VX/>^]N>^]NN^]MN^^GN^] +MM^^^GN^]N.^^GN^]N>^^GN^]NN^^GN.!J`KEC8KHIY+OOHKOOI_OOHOOOI_O +MOHSOOI_OOHWOOI_OOH[OOI_OO;?OOI_OO;'OOIX*[[Z*[[Z?[[Z+[[Z?[[Z, +M"FAA;FMA:W4@[[Z*[[Z?[[Z+[[Z?[[Z,[[VD"N^^BN^^G^^^B^^^G^^^C.^^ +2G>^]H0KEC8KHIY+C@:[EOHP* +eofeof example['x0201.jis'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$C02-"(T,C @@ -144,7 +303,6 @@ M/1LH23DZ-EXW7CA>.5XZ7ALD0B1(&RA""ALD0D@^,U$;*$E*7TM?3%]-7TY? M-U\Q7ALH0@H;*$E*7TM?3!LH0@IH86YK86MU(!LH24I?2U],)!LH0@H;*$E* 97TM?3%TA&RA""ALD0D@^,U$D3CAE&RA""@`` eofeof -#` example['x0201.sosi'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA*"ALD0D$T,5$C02-"(T,C @@ -154,7 +312,6 @@ M*$H]#CDZ-EXW7CA>.5XZ7@\;)$(D2!LH2@H;)$)(/C-1&RA*#DI?2U],7TU? M3E\W7S%>#PH.2E]+7TP/&RA*"FAA;FMA:W4@#DI?2U],)`\;*$H*#DI?2U], 672$/&RA*"ALD0D@^,U$D3CAE&RA""@`` eofeof -#" example['x0201.x0208'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$;*$)!0D-$ @@ -164,7 +321,34 @@ M)$)(/C-1)5$E5"57)5HE724M(2PE(B$K&RA""ALD0B51)50E51LH0@IH86YK M86MU(!LD0B51)50E52$B&RA""ALD0B51)50E525S(2,;*$(*&R1"2#XS421. &.&4;*$(* eofeof -#` + +# -X is necessary to allow X0201 in SJIS +# -Z convert X0208 alphabet to ASCII +print "X0201 conversion: SJIS "; + test('-jXZ',example['x0201.sjis'],[example['x0201.x0208']]); +print "X0201 conversion: JIS "; + test('-jZ',example['x0201.jis'],[example['x0201.x0208']]); +print "X0201 conversion:SI/SO "; + test('-jZ',example['x0201.sosi'],[example['x0201.x0208']]); +print "X0201 conversion: EUC "; + test('-jZ',example['x0201.euc'],[example['x0201.x0208']]); +print "X0201 conversion: UTF8 "; + test('-jZ',example['x0201.utf'],[example['x0201.x0208']]); +# -x means X0201 output +print "X0201 output: SJIS "; + test('-xs',example['x0201.euc'],[example['x0201.sjis']]); +print "X0201 output: JIS "; + test('-xj',example['x0201.sjis'],[example['x0201.jis']]); +print "X0201 output: EUC "; + test('-xe',example['x0201.jis'],[example['x0201.euc']]); +print "X0201 output: UTF8 "; + test('-xw',example['x0201.jis'],[example['x0201.utf']]); + +# MIME decode + +print "\nMIME test\n\n"; + +# MIME ISO-2022-JP example['mime.iso2022'] = <<'eofeof'.unpack('u')[0] M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23TI566Q/4U9)1WEH2S\]"CT_ @@ -178,7 +362,6 @@ M96E23U!Y:S=D"FAS;U-G/3T_/2`]/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W M96E23U!Y:S=D:'-O4V<]/3\]"CT_25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I 44D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.ans.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -188,7 +371,6 @@ M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D"FAS;U-G/3T_/2`] M/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D:'-O4V<]/3\]"CT_ L25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I4D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.unbuf.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -206,7 +388,6 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-A<V4* M&R1"-$$[>B1./RD;*$)H<V]39ST]/ST@&R1"-$$[>B1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.unbuf'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -215,21 +396,48 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-A<V4* M&R1"-$$[>B1./RD;*$)H<V]39ST]/ST@&R1"-$$[>B1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.base64'] = <<'eofeof'.unpack('u')[0] M9W-M5"])3&YG<FU#>$I+-&=Q=4,S24LS9W%Q0E%:3TUI-39,,S0Q-&=S5T)1 M43!+9VUA1%9O3T@*9S)+1%1O3'=K8C)1;$E+;V=Q2T-X24MG9W5M0W%*3EEG <<T=#>$E+9V=U;4,X64Q&9W)70S592VMG<6U""F=Q eofeof -#" example['mime.base64.ans'] = <<'eofeof'.unpack('u')[0] M&R1")$M&?B1I)#LD1D0Z)"TD7B0Y)"PA(D5L-7XV83E9)$<A(ALH0@T*&R1" M(T<E-R5G)4,E+R1R0C\_="0J)"0D1B0B)&LD*D4Y)$,D1B0B)&LD<R1')#<D (9R0F)"L;*$(E eofeof -#' + +# print "Next test is expected to Fail.\n"; +print "MIME decode (strict) "; + $tmp = test('-jmS',example['mime.iso2022'],[example['mime.ans.strict']]); + +example['mime.ans.alt'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"<F]K96YC87-E"ALD0C1! +H.WHD3C\I.W8T03MZ)$X_*3MV&RA""ALD0C1!.WHD3B5&)3DE)!LH0@`` +eofeof + +example['mime.unbuf.alt'] = <<'eofeof'.unpack('u')[0] +M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"<F]K96YC87-E"ALD0C1! +H.WHD3C\I.W8T03MZ)$X_*3MV&RA""ALD0C1!.WHD3B5&)3DE)!LH0@`` +eofeof + +print "MIME decode (nonstrict)"; + $tmp = test('-jmN',example['mime.iso2022'],[example['mime.ans'],example['mime.ans.alt']]); + # open(OUT,">tmp1");print OUT pack('u',$tmp);close(OUT); +# unbuf mode implies more pessimistic decode +print "MIME decode (unbuf) "; + $tmp = test('-jmNu',example['mime.iso2022'],[example['mime.unbuf'],example['mime.unbuf.alt']]); + # open(OUT,">tmp2");print OUT pack('u',$tmp);close(OUT); +print "MIME decode (base64) "; + test('-jTmB',example['mime.base64'],[example['mime.base64.ans']]); + +# MIME ISO-8859-1 example['mime.is8859'] = <<'eofeof'.unpack('u')[0] M/3])4T\M.#@U.2TQ/U$_*CU#-V%V83\_/2`*4&5E<B!4]G)N9W)E;@I,87-S @@ -244,75 +452,255 @@ M("!<(")-:6X@:V%E<&AE<W0@:&%R(&9A865T(&5T(&9O96PA(@I!87)H=7,@ M56YI=F5R<VET>2P@1$5.34%22R`@7"`B36EN(&OF<&AE<W0@:&%R(&;E970@ )970@9OAL(2(* eofeof -#" -print 'JIS to JIS ... '; test(' ', example['jis'], example['jis']) -print 'JIS to SJIS... '; test('-s', example['jis'], example['sjis']) -print 'JIS to EUC ... '; test('-e', example['jis'], example['euc']) +# Without -l, ISO-8859-1 was handled as X0201. -print 'SJIS to JIS ... '; test('-j', example['sjis'], example['jis']) -print 'SJIS to SJIS... '; test('-s', example['sjis'], example['sjis']) -print 'SJIS to EUC ... '; test('-e', example['sjis'], example['euc']) +print "MIME ISO-8859-1 (Q) "; + test('-ml',example['mime.is8859'],[example['mime.is8859.ans']]); -print 'EUC to JIS ... '; test(' ', example['euc'], example['jis']) -print 'EUC to SJIS... '; test('-s', example['euc'], example['sjis']) -print 'EUC to EUC ... '; test('-e', example['euc'], example['euc']) +# test for -f is not so simple. +print "\nBug Fixes\n\n"; -# Ambigous Case -print 'Ambiguous Case. '; test('' , example['amb'], example['amb.euc']) +# test_data/cr -# Input assumption -print 'SJIS Input assumption ' -test('-Sx', example['amb'], example['amb.sjis']) +example['test_data/cr'] = <<'eofeof'.unpack('u')[0] +1I,:DN:3(#71E<W0-=&5S=`T` +eofeof -# X0201 仮名 -# X0201->X0208 conversion -# X0208 aphabet -> ASCII -# X0201 相互変換 +example['test_data/cr.ans'] = <<'eofeof'.unpack('u')[0] +7&R1")$8D.21(&RA""G1E<W0*=&5S=`H` +eofeof -print "\nX0201 test\n\n" +print "test_data/cr "; + test('-jd',example['test_data/cr'],[example['test_data/cr.ans']]); +# test_data/fixed-qencode -# -X is necessary to allow X0201 in SJIS -# -Z convert X0208 alphabet to ASCII -print 'X0201 conversion: SJIS ' -test('-XZ', example['x0201.sjis'], example['x0201.x0208']) -print 'X0201 conversion: JIS ' -test('-Z', example['x0201.jis'], example['x0201.x0208']) -print 'X0201 conversion:SI/SO ' -test('-Z', example['x0201.sosi'], example['x0201.x0208']) -print 'X0201 conversion: EUC ' -test('-Z', example['x0201.euc'], example['x0201.x0208']) -# -x means X0201 output -print 'X0201 output: SJIS ' -test('-xs', example['x0201.euc'], example['x0201.sjis']) -print 'X0201 output: JIS ' -test('-xj', example['x0201.sjis'], example['x0201.jis']) -print 'X0201 output: EUC ' -test('-xe', example['x0201.jis'], example['x0201.euc']) +example['test_data/fixed-qencode'] = <<'eofeof'.unpack('u')[0] +M("`@("`@("`],4(D0CYE/STS1#TQ0BA""B`@("`@("`@/3%")$(^93TS1CTS +'1#TQ0BA""@`` +eofeof -# MIME decode +example['test_data/fixed-qencode.ans'] = <<'eofeof'.unpack('u')[0] +F("`@("`@("`;)$(^93\]&RA""B`@("`@("`@&R1"/F4_/1LH0@H` +eofeof -print "\nMIME test\n\n" +print "test_data/fixed-qencode "; + test('-jmQ',example['test_data/fixed-qencode'],[example['test_data/fixed-qencode.ans']]); +# test_data/long-fold-1 -# MIME ISO-2022-JP +example['test_data/long-fold-1'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AHJ2SI.RD +M\J2]I,ZDWJ3>I**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DSR!# +M4B],1B"DSKG4H:,-"J2SI+.DSR!#4B"DSKG4H:,-I+.DLZ3/($Q&+T-2(*3. +9N=2AHPH-"J2SI+.DSR!,1B"DSKG4H:,*"@`` +eofeof -print "Next test is expeced to Fail.\n" +example['test_data/long-fold-1.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD<B0])$XD7B1>)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#<A(B1()$$D920F)$<D021G)',D+B1L)&LD*R1B)#<D<QLH0@H; +M)$(D2B0D(2,;*$(*&R1")#,D,R1/&RA"($-2+TQ&(!LD0B1..50A(QLH0@H; +M)$(D,R0S)$\;*$(@0U(@&R1")$XY5"$C&RA""ALD0B0S)#,D3QLH0B!,1B]# +M4B`;)$(D3CE4(2,;*$(*"ALD0B0S)#,D3QLH0B!,1B`;)$(D3CE4(2,;*$(* +!"@`` +eofeof -print 'MIME decode (strict) ' -tmp = test('-m', example['mime.iso2022'], example['mime.ans.strict']) -print 'MIME decode (nonstrict)' -tmp = test('-m', example['mime.iso2022'], example['mime.ans']) -# open(OUT,'>tmp1');print OUT pack('u',$tmp);close(OUT); -# unbuf mode implies more pessimistic decode -print 'MIME decode (unbuf) ' -test('-mu', example['mime.iso2022'], example['mime.unbuf']) -print 'MIME decode (base64) ' -t = test('-mB', example['mime.base64'], example['mime.base64.ans']) +print "test_data/long-fold-1 "; + test('-jTF60',example['test_data/long-fold-1'],[example['test_data/long-fold-1.ans']]); +# test_data/long-fold -# MIME ISO-8859-1 +example['test_data/long-fold'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AHJ2SI.RD +M\J2]I,ZDWJ3>I**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DS\.[ +'I*2YU*&C"@`` +eofeof -# Without -l, ISO-8859-1 was handled as X0201. +example['test_data/long-fold.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD<B0])$XD7B1>)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#<A(B1()$$D920F)$<D021G)',D+B1L)&LD*R1B)#<D<QLH0@H; +:)$(D2B0D(2,D,R0S)$]#.R0D.50A(QLH0@H` +eofeof + +print "test_data/long-fold "; + test('-jTf60',example['test_data/long-fold'],[example['test_data/long-fold.ans']]); +# test_data/mime_out + +example['test_data/mime_out'] = <<'eofeof'.unpack('u')[0] +M"BTM+2T*4W5B:F5C=#H@86%A82!A86%A(&%A86$@86%A82!A86%A(&%A86$@ +M86%A82!A86%A(&%A86$@86%A82!A86%A(&%A86$@86%A82!A86%A"BTM+2T* +M4W5B:F5C=#H@I**DI*2FI*BDJJ2KI*VDKZ2QI+.DM:2WI+FDNZ2]I+^DP:3$ +MI,:DR*3*I,NDS*3-I,ZDSZ32I-6DV*3;I-ZDWZ3@I.&DXJ3DI*2DYJ2HI.@* +M+2TM+0I3=6)J96-T.B!A86%A(&%A86$@86%A82!A86%A(&%A86$@86%A82!A +I86%A(*2BI*2DIJ2HI*H@86%A82!A86%A(&%A86$@86%A80HM+2TM"@H` +eofeof + +example['test_data/mime_out.ans'] = <<'eofeof'.unpack('u')[0] +M"BTM+2T*4W5B:F5C=#H@86%A82!A86%A(&%A86$@86%A82!A86%A(&%A86$@ +M86%A82!A86%A(&%A86$*(&%A86$@86%A82!A86%A(&%A86$@86%A80HM+2TM +M"E-U8FIE8W0Z(#T_25-/+3(P,C(M2E`_0C]'>5)#2D-):TI#46U*0V=K2VE1 +M<DI#,&M,>5%X2D1-:TY343-*1&MK3WAS;U%G/3T_/2`*"3T_25-/+3(P,C(M +M2E`_0C]'>5)#2D0P:U!Y4D)*15%K4FE224I%;VM3>5)-2D4P:U1I4E!*1DEK +M5E-264=Y:$,_/2`*"3T_25-/+3(P,C(M2E`_0C]'>5)#2D9S:UAI4F9*1T%K +M65-2:4I'46M*0U)M2D-G:V%"<V]19ST]/ST@"BTM+2T*4W5B:F5C=#H@86%A +M82!A86%A(&%A86$@86%A82!A86%A(&%A86$@86%A82`]/TE33RTR,#(R+4I0 +M/T(_1WE20TI#26)+14D]/ST@"@D]/TE33RTR,#(R+4I0/T(_1WE20TI#46M* +J:5%O2D-O8DM%23T_/2`@86%A80H@86%A82!A86%A(&%A86$*+2TM+0H* +eofeof + +print "test_data/mime_out "; + test('-jM',example['test_data/mime_out'],[example['test_data/mime_out.ans']]); +# test_data/multi-line + +example['test_data/multi-line'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AH@"DLZ3L +MI/*DO:3.I-ZDWJ2BI,2DJZ2FI,BAHJ3)I*:DXJ3?I-"DJ*2LI.^DZZ2DI+>A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +example['test_data/multi-line.ans'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AH@"DLZ3L +MI/*DO:3.I-ZDWJ2BI,2DJZ2FI,BAHJ3)I*:DXJ3?I-"DJ*2LI.^DZZ2DI+>A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +print "test_data/multi-line "; + test('-e',example['test_data/multi-line'],[example['test_data/multi-line.ans']]); +# test_data/nkf-19-bug-1 + +example['test_data/nkf-19-bug-1'] = <<'eofeof'.unpack('u')[0] +,I*:DJZ2D"KK8QJ,* +eofeof + +example['test_data/nkf-19-bug-1.ans'] = <<'eofeof'.unpack('u')[0] +8&R1")"8D*R0D&RA""ALD0CI81B,;*$(* +eofeof + +print "test_data/nkf-19-bug-1 "; + test('-Ej',example['test_data/nkf-19-bug-1'],[example['test_data/nkf-19-bug-1.ans']]); +# test_data/nkf-19-bug-2 + +example['test_data/nkf-19-bug-2'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +example['test_data/nkf-19-bug-2.ans'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +print "test_data/nkf-19-bug-2 "; + test('-Ee',example['test_data/nkf-19-bug-2'],[example['test_data/nkf-19-bug-2.ans']]); +# test_data/nkf-19-bug-3 + +example['test_data/nkf-19-bug-3'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['test_data/nkf-19-bug-3.ans'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +print "test_data/nkf-19-bug-3 "; + test('-e',example['test_data/nkf-19-bug-3'],[example['test_data/nkf-19-bug-3.ans']]); +# test_data/non-strict-mime + +example['test_data/non-strict-mime'] = <<'eofeof'.unpack('u')[0] +M/3])4T\M,C`R,BU*4#]"/PIG<U-#;V]+.6=R-D-O;TQ%9W1Y0W0T1D-$46]. +M0V\V16=S,D]N;T999S1Y1%=)3$IG=4-0:UD*2W!G<FU#>$E+:6=R,D-V;TMI +,9W-30V]O3&,*/ST* +eofeof + +example['test_data/non-strict-mime.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$8D)"0_)$`D)"1&)%XD.2$C&RA"#0H-"ALD0CMD)$\[?B$Y)6PE.21+ +<)&(]<20K)#LD1B0D)#\D0"0D)$8D)"1>&RA""@`` +eofeof + +print "test_data/non-strict-mime "; + test('-jTmN',example['test_data/non-strict-mime'],[example['test_data/non-strict-mime.ans']]); +# test_data/q-encode-softrap + +example['test_data/q-encode-softrap'] = <<'eofeof'.unpack('u')[0] +H/3%")$(T03MZ)3T*,R$\)4DD3CTQ0BA""CTQ0B1"2E$T.3TQ0BA""@`` +eofeof + +example['test_data/q-encode-softrap.ans'] = <<'eofeof'.unpack('u')[0] +>&R1"-$$[>B4S(3PE221.&RA""ALD0DI1-#D;*$(* +eofeof + +print "test_data/q-encode-softrap "; + test('-jTmQ',example['test_data/q-encode-softrap'],[example['test_data/q-encode-softrap.ans']]); +# test_data/rot13 + +example['test_data/rot13'] = <<'eofeof'.unpack('u')[0] +MI+.D\Z3+I,&DSZ&BS:W"]*3(I*2DI*3>I+FAHPH*;FMF('9E<BXQ+CDR(*3R +MS?C-T:2UI+NDQJ2DI+^DP*2DI,:DI*3>I+FDK*&B05-#24D@I,O"T*2WI,8@ +M4D]4,3,@I*P*P+6DMZ2OQK"DI*3&I*2DRJ2DI.BDIJ3'H:*PRK*\I,ZDZ*2F +MI,O*T;2YI+6D[*3>I+ND\Z&C"@HE(&5C:&\@)VAO9V4G('P@;FMF("UR"FAO +#9V4* +eofeof + +example['test_data/rot13.ans'] = <<'eofeof'.unpack('u')[0] +M&R1"4V)31%-Z4W!3?E!1?%QQ15-W4U-34U,O4VA04ALH0@H*87AS(&ER92XQ +M+CDR(!LD0E-#?$E\(E-D4VI3=5-34VY3;U-34W534U,O4VA36U!1&RA"3D90 +M5E8@&R1"4WIQ(5-F4W4;*$(@14)',3,@&R1"4UL;*$(*&R1";V139E->=5]3 +M4U-U4U-3>5-34SE355-V4%%?>6%K4WU3.5-54WIY(F-H4V13/5,O4VI31%!2 +A&RA""@HE(')P=6(@)W5B='(G('P@87AS("UE"G5B='(* +eofeof + +print "test_data/rot13 "; + test('-jr',example['test_data/rot13'],[example['test_data/rot13.ans']]); +# test_data/slash + +example['test_data/slash'] = <<'eofeof'.unpack('u')[0] +7("`]/U8\5"U5.5=%2RTK.U<U32LE+PH` +eofeof + +example['test_data/slash.ans'] = <<'eofeof'.unpack('u')[0] +7("`]/U8\5"U5.5=%2RTK.U<U32LE+PH` +eofeof + +print "test_data/slash "; + test(' ',example['test_data/slash'],[example['test_data/slash.ans']]); +# test_data/z1space-0 + +example['test_data/z1space-0'] = <<'eofeof'.unpack('u')[0] +"H:$` +eofeof + +example['test_data/z1space-0.ans'] = <<'eofeof'.unpack('u')[0] +"H:$` +eofeof + +print "test_data/z1space-0 "; + test('-e -Z',example['test_data/z1space-0'],[example['test_data/z1space-0.ans']]); +# test_data/z1space-1 + +example['test_data/z1space-1'] = <<'eofeof'.unpack('u')[0] +"H:$` +eofeof + +example['test_data/z1space-1.ans'] = <<'eofeof'.unpack('u')[0] +!(``` +eofeof + +print "test_data/z1space-1 "; + test('-e -Z1',example['test_data/z1space-1'],[example['test_data/z1space-1.ans']]); +# test_data/z1space-2 + +example['test_data/z1space-2'] = <<'eofeof'.unpack('u')[0] +"H:$` +eofeof + +example['test_data/z1space-2.ans'] = <<'eofeof'.unpack('u')[0] +"("`` +eofeof + +print "test_data/z1space-2 "; + test('-e -Z2',example['test_data/z1space-2'],[example['test_data/z1space-2.ans']]); -print 'MIME ISO-8859-1 (Q) ' -test('-ml', example['mime.is8859'], example['mime.is8859.ans']) +# end |