From 35b917f3ee63fc2251e3d4fc063a48f2d91bb96c Mon Sep 17 00:00:00 2001 From: naruse Date: Fri, 29 Oct 2004 06:51:33 +0000 Subject: follow to nkf 2.0.4 :new constants NKF::VERSION NKF::ASCII NKF::UTF8 NKF::UTF16 NKF::UTF32 :new methods NFK.guess1 (guess) NKF.guess2 (from nkf2) git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7132 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ext/nkf/MANIFEST | 4 +- ext/nkf/lib/kconv.rb | 197 ++++++++++++++++--- ext/nkf/nkf.c | 191 +++++++++++++----- ext/nkf/test.rb | 538 ++++++++++++++++++++++++++++++++++++++++++++------- 4 files changed, 782 insertions(+), 148 deletions(-) (limited to 'ext/nkf') diff --git a/ext/nkf/MANIFEST b/ext/nkf/MANIFEST index 5114a3762a..2a66f44ae0 100644 --- a/ext/nkf/MANIFEST +++ b/ext/nkf/MANIFEST @@ -3,5 +3,7 @@ depend extconf.rb lib/kconv.rb nkf.c -nkf1.7/nkf.c +nkf-utf8/config.h +nkf-utf8/nkf.c +nkf-utf8/utf8tbl.c test.rb diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb index af6d82275f..1fd28a5a59 100644 --- a/ext/nkf/lib/kconv.rb +++ b/ext/nkf/lib/kconv.rb @@ -1,73 +1,226 @@ require 'nkf' module Kconv - AUTO = NKF::AUTO - JIS = NKF::JIS - EUC = NKF::EUC - SJIS = NKF::SJIS - BINARY = NKF::BINARY - NOCONV = NKF::NOCONV - UNKNOWN = NKF::UNKNOWN + #Constant of Encoding + AUTO = ::NKF::AUTO + JIS = ::NKF::JIS + EUC = ::NKF::EUC + SJIS = ::NKF::SJIS + BINARY = ::NKF::BINARY + NOCONV = ::NKF::NOCONV + ASCII = ::NKF::ASCII + UTF8 = ::NKF::UTF8 + UTF16 = ::NKF::UTF16 + UTF32 = ::NKF::UTF32 + UNKNOWN = ::NKF::UNKNOWN + + #Regexp of Encoding + Iconv_Shift_JIS = /\A(?: + [\x00-\x7f\xa1-\xdf] | + \x81[\x40-\x7e\x80-\xac\xb8-\xbf\xc8-\xce\xda-\xe8\xf0-\xf7\xfc] | + \x82[\x4f-\x58\x60-\x79\x81-\x9a\x9f-\xf1] | + \x83[\x40-\x7e\x80-\x96\x9f-\xb6\xbf-\xd6\x40-\x60] | + \x84[\x40-\x60\x70-\x7e\x80-\x91\x9f-\xbe\x9f-\xfc] | + [\x89-\x8f\x90-\x97\x99-\x9f\xe0-\xea][\x40-\x7e] | + [\x89-\x97\x99-\x9f\xe0-\xe9][\x80-\xfc] | + \x98[\x40-\x72\x9f-\xfc] | + \xea[\x80-\xa4] + )*\z/nx + Iconv_EUC_JP = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1\xb0-\xbce\xd0-\xf3][\xa1-\xfe] | + \xa2[\xa1-\xae\xba-\xc1\xca-\xd0\xdc-\xea\xf2-\xf9\xfe] | + \xa3[\xb0-\xb9\xc1-\xda\xe1-\xfa] | + \xa4[\xa1-\xf3] | + \xa5[\xa1-\xf6] | + \xa6[\xa1-\xb8\xc1-\xd8] | + \xa7[\xa1-\xc1\xd1-\xf1] | + \xa8[\xa1-\xc0] | + \xcf[\xa1-\xd3] | + \xf4[\xa1-\xa6] + )*\z/nx + Iconv_UTF8 = /\A(?:\xef\xbb\xbf)?(?: + [\x00-\x7f] | + \xc2[\x80-\x8d\x90-\x9f\xa1\xaa\xac\xae-\xb1\xb4\xb6\xb8\xba\xbf] | + \xc3[\x80-\xbf] | + \xc4[\x80-\x93\x96-\xa2\xa4-\xab\xae-\xbf] | + \xc5[\x80-\x8d\x90-\xbe] | + \xc7[\x8d-\x9c\xb5] | + \xcb[\x87\x98-\x9b\x9d] | + \xce[\x84-\x86\x88-\x8a\x8c\x8e-\xa1\xa3-\xbf] | + \xcf[\x80-\x8e] | + \xd0[\x81-\x8c\x8e-\xbf] | + \xd1[\x80-\x8f\x91-\x9f] | + \xe2\x84[\x83\x96\xa2\xab] | + \xe2\x86[\x83\x91-\x93\x96\xa2\xab] | + \xe2\x87[\x83\x91-\x94\x96\xa2\xab] | + \xe2\x88[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0\xa2\xa7-\xac\xb4-\xb5\xbd] | + \xe2\x89[\x82-\x83\x87-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa6-\xac\xb4-\xb5\xbd] | + \xe2[\x8a\x8c][\x82-\x83\x86-\x88\x8b\x91-\x94\x96\x9a\x9d-\x9e\xa0-\xa2\xa5-\xac\xb4-\xb5\xbd] | + \xe2[\x94-\x99][\x81-\x83\x86-\x88\x8b-\x8c\x8f-\x94\x96-\x98\x9a-\x9e\xa0-\xac\xaf-\xb0\xb3-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3\x80[\x81-\x83\x85-\x98\x9a-\x9e\xa0-\xad\xaf-\xb0\xb2-\xb5\xb7-\xb8\xbb-\xbd\xbf] | + \xe3[\x81-\x83\xb8-\xbf][\x81-\xbf] | + [\xe5-\xe7][\x80-\xbf][\x81-\xbf] | + \xe8[\x80-\xae\xb0-\xbf][\x81-\xbf] | + \xe9[\x80-\x92\x95-\xb1\xb3-\xbe][\x81-\xbf] | + \xef[\xbc-\xbe][\x81-\xbf] | + )*\z/nx + RegexpShiftjis = /\A(?: + [\x00-\x7f\xa1-\xdf] | + [\x81-\x9f\xe0-\xfc][\x40-\x7e\x80-\xfc] + )*\z/nx + RegexpEucjp = /\A(?: + [\x00-\x7f] | + \x8e [\xa1-\xdf] | + \x8f [\xa1-\xdf] [\xa1-\xdf] | + [\xa1-\xdf] [\xa1-\xdf] + )*\z/nx + RegexpUtf8 = /\A(?: + [\x00-\x7f] | + [\xc2-\xdf] [\x80-\xbf] | + \xe0 [\xa0-\xbf] [\x80-\xbf] | + [\xe1-\xef] [\x80-\xbf] [\x80-\xbf] | + \xf0 [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | + [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | + \xf4 [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] + )*\z/nx + + # + # kconv + # + def kconv(str, out_code, in_code = AUTO) opt = '-' case in_code - when NKF::JIS + when ::NKF::JIS opt << 'J' - when NKF::EUC + when ::NKF::EUC opt << 'E' - when NKF::SJIS + when ::NKF::SJIS opt << 'S' + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'W' end case out_code - when NKF::JIS + when ::NKF::JIS opt << 'j' - when NKF::EUC + when ::NKF::EUC opt << 'e' - when NKF::SJIS + when ::NKF::SJIS opt << 's' - when NKF::NOCONV + when ::NKF::UTF8 + when ::NKF::UTF16 + opt << 'w' + when ::NKF::NOCONV return str end opt = '' if opt == '-' - NKF::nkf(opt, str) + ::NKF::nkf(opt, str) end module_function :kconv + # + # Encode to + # + def tojis(str) - NKF::nkf('-j', str) + ::NKF::nkf('-j', str) end module_function :tojis def toeuc(str) - NKF::nkf('-e', str) + ::NKF::nkf('-e', str) end module_function :toeuc def tosjis(str) - NKF::nkf('-s', str) + ::NKF::nkf('-s', str) end module_function :tosjis + def toutf8(str) + ::NKF::nkf('-w', str) + end + module_function :toutf8 + + def toutf16(str) + ::NKF::nkf('-w16', str) + end + module_function :toutf16 + + # + # guess + # + def guess(str) - NKF::guess(str) + ::NKF::guess(str) end module_function :guess + + def guess_old(str) + ::NKF::guess_old(str) + end + module_function :guess_old + + # + # isEncoding + # + + def iseuc(str) + RegexpEucjp.match( str ) + end + module_function :iseuc + + def issjis(str) + RegexpShiftjis.match( str ) + end + module_function :issjis + + def isutf8(str) + RegexpUtf8.match( str ) + end + module_function :isutf8 + end class String def kconv(out_code, in_code=Kconv::AUTO) Kconv::kconv(self, out_code, in_code) end + + # to Encoding def tojis - NKF::nkf('-j', self) + ::NKF::nkf('-j', self) end def toeuc - NKF::nkf('-e', self) + ::NKF::nkf('-e', self) end def tosjis - NKF::nkf('-s', self) + ::NKF::nkf('-s', self) + end + def toutf8 + ::NKF::nkf('-w', self) + end + def toutf16 + ::NKF::nkf('-w16', self) + end + + # is Encoding + def iseuc + Kconv.iseuc( self ) + end + + def issjis + Kconv.issjis( self ) + end + + def isutf8 + Kconv.isutf8( self ) end end diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c index ca6de73e10..6517b3aba1 100644 --- a/ext/nkf/nkf.c +++ b/ext/nkf/nkf.c @@ -1,51 +1,82 @@ +/* + * NKF Module for Ruby base on nkf 2.x + * + * original nkf2.0 is maintained at http://sourceforge.jp/projects/nkf/ + * + */ + +static char *RVersion = "2.0.4.1r1"; + #include "ruby.h" +/* Encoding Constants */ #define _AUTO 0 #define _JIS 1 #define _EUC 2 #define _SJIS 3 #define _BINARY 4 #define _NOCONV 4 +#define _ASCII 5 +/* 0b011x is reserved for UTF-8 Family */ +#define _UTF8 6 +/* 0b10xx is reserved for UTF-16 Family */ +#define _UTF16 8 +/* 0b11xx is reserved for UTF-32 Family */ +#define _UTF32 12 +#define _OTHER 16 #define _UNKNOWN _AUTO +/* Replace nkf's getchar/putchar for variable modification */ +/* we never use getc, ungetc */ + #undef getc #undef ungetc -#define getc(f) (input_ctr=i_len?-1:input[input_ctr++]) +#define ungetc(c,f) input_ctr-- +#define INCSIZE 32 #undef putchar -#define putchar(c) rb_nkf_putchar(c) +#undef TRUE +#undef FALSE +#define putchar(c) rb_nkf_putchar(c) -#define INCSIZE 32 -static int incsize; +/* Input/Output pointers */ -static unsigned char *input, *output; -static int input_ctr, i_len; -static int output_ctr, o_len; +static unsigned char *output; +static unsigned char *input; +static int input_ctr; +static int i_len; +static int output_ctr; +static int o_len; +static int incsize; -static VALUE dst; +static VALUE result; static int rb_nkf_putchar(c) - unsigned int c; + unsigned int c; { if (output_ctr >= o_len) { o_len += incsize; - rb_str_resize(dst, o_len); - output = RSTRING(dst)->ptr; + rb_str_resize(result, o_len); incsize *= 2; + output = RSTRING(result)->ptr; } output[output_ctr++] = c; return c; } +/* Include kanji filter main part */ +/* getchar and putchar will be replaced during inclusion */ + #define PERL_XS 1 -#include "nkf1.7/nkf.c" +#include "nkf-utf8/utf8tbl.c" +#include "nkf-utf8/nkf.c" static VALUE rb_nkf_kconv(obj, opt, src) - VALUE obj, opt, src; + VALUE obj, opt, src; { char *opt_ptr, *opt_end; volatile VALUE v; @@ -58,44 +89,46 @@ rb_nkf_kconv(obj, opt, src) if (*opt_ptr != '-') { continue; } - arguments(opt_ptr); + options(opt_ptr); } incsize = INCSIZE; - input_ctr = 0; + input_ctr = 0; StringValue(src); input = RSTRING(src)->ptr; i_len = RSTRING(src)->len; - dst = rb_str_new(0, i_len*3 + 10); - v = dst; + result = rb_str_new(0, i_len*3 + 10); + v = result; output_ctr = 0; - output = RSTRING(dst)->ptr; - o_len = RSTRING(dst)->len; + output = RSTRING(result)->ptr; + o_len = RSTRING(result)->len; *output = '\0'; - if(iso8859_f && (oconv != j_oconv || !x0201_f )) { - iso8859_f = FALSE; - } + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); kanji_convert(NULL); - RSTRING(dst)->ptr[output_ctr] = '\0'; - RSTRING(dst)->len = output_ctr; - OBJ_INFECT(dst, src); + RSTRING(result)->ptr[output_ctr] = '\0'; + RSTRING(result)->len = output_ctr; + OBJ_INFECT(result, src); - return dst; + return result; } + /* + * NKF.guess1 + * * Character code detection - Algorithm described in: * Ken Lunde. `Understanding Japanese Information Processing' * Sebastopol, CA: O'Reilly & Associates. */ static VALUE -rb_nkf_guess(obj, src) - VALUE obj, src; +rb_nkf_guess1(obj, src) + VALUE obj, src; { unsigned char *p; unsigned char *pend; @@ -107,16 +140,16 @@ rb_nkf_guess(obj, src) if (p == pend) return INT2FIX(_UNKNOWN); #define INCR do {\ - p++;\ - if (p==pend) return INT2FIX(_UNKNOWN);\ - sequence_counter++;\ - if (sequence_counter % 2 == 1 && *p != 0xa4)\ + p++;\ + if (p==pend) return INT2FIX(_UNKNOWN);\ + sequence_counter++;\ + if (sequence_counter % 2 == 1 && *p != 0xa4)\ sequence_counter = 0;\ - if (6 <= sequence_counter) {\ - sequence_counter = 0;\ - return INT2FIX(_EUC);\ - }\ -} while (0) + if (6 <= sequence_counter) {\ + sequence_counter = 0;\ + return INT2FIX(_EUC);\ + }\ + } while (0) if (*p == 0xa4) sequence_counter = 1; @@ -180,19 +213,77 @@ rb_nkf_guess(obj, src) return INT2FIX(_UNKNOWN); } + +/* + * NKF.guess2 + * + * Guess Encoding By NKF2.0 Routine + */ + +static VALUE +rb_nkf_guess2(obj, src) + VALUE obj, src; +{ + int code = _BINARY; + + reinit(); + + input_ctr = 0; + StringValue(src); + input = RSTRING(src)->ptr; + i_len = RSTRING(src)->len; + + if(x0201_f == WISH_TRUE) + x0201_f = ((!iso2022jp_f)? TRUE : NO_X0201); + + guess_f = TRUE; + kanji_convert( NULL ); + guess_f = FALSE; + + if (!is_inputcode_mixed) { + if (strcmp(input_codename, "") == 0) { + code = _ASCII; + } else if (strcmp(input_codename, "ISO-2022-JP") == 0) { + code = _JIS; + } else if (strcmp(input_codename, "EUC-JP") == 0) { + code = _EUC; + } else if (strcmp(input_codename, "Shift_JIS") == 0) { + code = _SJIS; + } else if (strcmp(input_codename, "UTF-8") == 0) { + code = _UTF8; + } else if (strcmp(input_codename, "UTF-16") == 0) { + code = _UTF16; + } else if (strlen(input_codename) > 0) { + code = _UNKNOWN; + } + } + + return INT2FIX( code ); +} + + +/* Initialize NKF Module */ + void Init_nkf() { - VALUE mKconv = rb_define_module("NKF"); - - rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); - rb_define_module_function(mKconv, "guess", rb_nkf_guess, 1); - - rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); - rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); - rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); - rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); - rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); - rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); - rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + VALUE mKconv = rb_define_module("NKF"); + + rb_define_module_function(mKconv, "nkf", rb_nkf_kconv, 2); + rb_define_module_function(mKconv, "guess", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess1", rb_nkf_guess1, 1); + rb_define_module_function(mKconv, "guess2", rb_nkf_guess2, 1); + + rb_define_const(mKconv, "AUTO", INT2FIX(_AUTO)); + rb_define_const(mKconv, "JIS", INT2FIX(_JIS)); + rb_define_const(mKconv, "EUC", INT2FIX(_EUC)); + rb_define_const(mKconv, "SJIS", INT2FIX(_SJIS)); + rb_define_const(mKconv, "BINARY", INT2FIX(_BINARY)); + rb_define_const(mKconv, "NOCONV", INT2FIX(_NOCONV)); + rb_define_const(mKconv, "ASCII", INT2FIX(_ASCII)); + rb_define_const(mKconv, "UTF8", INT2FIX(_UTF8)); + rb_define_const(mKconv, "UTF16", INT2FIX(_UTF16)); + rb_define_const(mKconv, "UTF32", INT2FIX(_UTF32)); + rb_define_const(mKconv, "UNKNOWN", INT2FIX(_UNKNOWN)); + rb_define_const(mKconv, "VERSION", rb_str_new2(RVersion)); } diff --git a/ext/nkf/test.rb b/ext/nkf/test.rb index 4519f8ba7e..7a2390d649 100644 --- a/ext/nkf/test.rb +++ b/ext/nkf/test.rb @@ -1,3 +1,19 @@ +#!/usr/local/bin/ruby +# +# nkf test program for nkf 1.7 +# Shinji KONO +# Sun Aug 18 12:25:40 JST 1996 +# Sun Nov 8 00:16:06 JST 1998 +# +# This is useful when you add new patch on nkf. +# Since this test is too strict, faileurs may not mean +# wrong conversion. +# +# nkf 1.5 differs on MIME decoding +# nkf 1.4 passes Basic Conversion tests +# nkf PDS version passes Basic Conversion tests using "nkf -iB -oB " +# + $counter = 0 def result(result, message = nil) $counter += 1 @@ -49,41 +65,150 @@ end $detail = false -def test(opt, input, expect) +def test(opt, input, expects) print "\nINPUT:\n", input if $detail - print "\nEXPECT:\n", expect if $detail + print "\nEXPECT:\n", expects.to_s if $detail result = nkf(opt, input) print "\nGOT:\n", result if $detail - print result == expect ? "Ok\n" : "Fail\n" - return result + expects.each do |e| + if result == e then + puts "Ok" + return result + end + end + puts "Fail" end + +example = Hash.new + # Basic Conversion -print "\nBasic Conversion test\n\n" +print "\nBasic Conversion test\n\n"; + +# I gave up simple literal quote because there are big difference +# on perl4 and perl5 on literal quote. Of course we cannot use +# jperl. -example = {} example['jis'] = <<'eofeof'.unpack('u')[0] M1FERED"6GIAR(%-E8V]N9"!3=&%G92"8I9=Y($AI M#28./ >@Y*#DR!+:6=O=2"!18&'@D^"8(._@]:$081@A+X* eofeof -#' example['euc'] = <<'eofeof'.unpack('u')[0] M1FERI?*E\R!+:6=O=2"AIJ'GH["CP:;!IMBGHJ?!J,`* eofeof -#' + +example['utf'] = <<'eofeof'.unpack('u')[0] +M1FERL)C^.7S)AJ"0D* +eofeof + +example['euc1'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['utf1'] = <<'eofeof'.unpack('u')[0] +AZ+J%Z:N/Z8JM"N>VNNFZEPKIM(D)Y+B*Z:"8Y+J8"0D* +eofeof + +example['jis2'] = <<'eofeof'.unpack('u')[0] ++&R1".EA&(QLH0@H` +eofeof + +example['sjis2'] = <<'eofeof'.unpack('u')[0] +%C=:3H0H` +eofeof + +example['euc2'] = <<'eofeof'.unpack('u')[0] +%NMC&HPH` +eofeof + +example['utf2'] = <<'eofeof'.unpack('u')[0] +'YI:.Z)>D"@`` +eofeof + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis'],[example['jis']]); +print "JIS to SJIS... ";test('-s',example['jis'],[example['sjis']]); +print "JIS to EUC ... ";test('-e',example['jis'],[example['euc']]); +print "JIS to UTF8... ";test('-w',example['jis'],[example['utf']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis'],[example['jis']]); +print "SJIS to SJIS... ";test('-s',example['sjis'],[example['sjis']]); +print "SJIS to EUC ... ";test('-e',example['sjis'],[example['euc']]); +print "SJIS to UTF8... ";test('-w',example['sjis'],[example['utf']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc'],[example['jis']]); +print "EUC to SJIS... ";test('-s',example['euc'],[example['sjis']]); +print "EUC to EUC ... ";test('-e',example['euc'],[example['euc']]); +print "EUC to UTF8... ";test('-w',example['euc'],[example['utf']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf'],[example['jis']]); +print "UTF8 to SJIS... ";test('-s',example['utf'],[example['sjis']]); +print "UTF8 to EUC ... ";test('-e',example['utf'],[example['euc']]); +print "UTF8 to UTF8... ";test('-w',example['utf'],[example['utf']]); + + + +# From JIS + +print "JIS to JIS ... ";test('-j',example['jis1'],[example['jis1']]); +print "JIS to SJIS... ";test('-s',example['jis1'],[example['sjis1']]); +print "JIS to EUC ... ";test('-e',example['jis1'],[example['euc1']]); +print "JIS to UTF8... ";test('-w',example['jis1'],[example['utf1']]); + +# From SJIS + +print "SJIS to JIS ... ";test('-j',example['sjis1'],[example['jis1']]); +print "SJIS to SJIS... ";test('-s',example['sjis1'],[example['sjis1']]); +print "SJIS to EUC ... ";test('-e',example['sjis1'],[example['euc1']]); +print "SJIS to UTF8... ";test('-w',example['sjis1'],[example['utf1']]); + +# From EUC + +print "EUC to JIS ... ";test('-j',example['euc1'],[example['jis1']]); +print "EUC to SJIS... ";test('-s',example['euc1'],[example['sjis1']]); +print "EUC to EUC ... ";test('-e',example['euc1'],[example['euc1']]); +print "EUC to UTF8... ";test('-w',example['euc1'],[example['utf1']]); + +# From UTF8 + +print "UTF8 to JIS ... ";test('-j',example['utf1'],[example['jis1']]); +print "UTF8 to SJIS... ";test('-s',example['utf1'],[example['sjis1']]); +print "UTF8 to EUC ... ";test('-e',example['utf1'],[example['euc1']]); +print "UTF8 to UTF8... ";test('-w',example['utf1'],[example['utf1']]); + +# Ambigous Case example['amb'] = <<'eofeof'.unpack('u')[0] MI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&EPK"QI<*PL:7"L+&E @@ -117,6 +242,31 @@ M)4(;*$(*&RA))4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q >)4(P,25",#$E0C`Q)4(P,25",#$E0C`Q)4(;*$(* eofeof +print "Ambiguous Case. "; + test('-j',example['amb'],[example['amb.euc']]); + +# Input assumption + +print "SJIS Input assumption "; + test('-jSx',example['amb'],[example['amb.sjis']]); + +# Broken JIS + +print "Broken JIS "; + $input = example['jis']; + $input.gsub("\033",''); + test('-Be',$input,[example['euc']]); +print "Broken JIS is safe on Normal JIS? "; + $input = example['jis']; + test('-Be',$input,[example['euc']]); + +# X0201 仮名 +# X0201->X0208 conversion +# X0208 aphabet -> ASCII +# X0201 相互変換 + +print "\nX0201 test\n\n"; + example['x0201.sjis'] = <<'eofeof'.unpack('u')[0] MD5.*<(-*@TR#3H-0@U*#2X--@T^#48-3"I%3B7""8()A@F*"8X)D@F6"9H*! M@H*"@X*$@H6"AH*'"I%3BTR-AH%)@9>!E(&0@9.!3X&5@9:!:8%J@7R!>X&! @@ -124,7 +274,6 @@ M@6V!;H%O@7"!CPJ4O(IPMK>X/;FZMMZWWKC>N=ZZWH+&"I2\BG#*W\O?S-_- MW\[?M]^QW@K*W\O?S`IH86YK86MU(,K?R]_,I`K*W\O?S-VA"I2\BG""S(SC !"@!" eofeof -#' example['x0201.euc'] = <<'eofeof'.unpack('u')[0] MP;2ST:6KI:VEKZ6QI;.EK*6NI;"ELJ6T"L&TL=&CP:/"H\.CQ*/%H\:CQZ/A @@ -134,7 +283,17 @@ MWJ3("LB^L]&.RH[?CLN.WX[,CM^.S8[?CLZ.WXZWCM^.L8[>"H[*CM^.RX[? MCLP*:&%N:V%K=2".RH[?CLN.WX[,CJ0*CLJ.WX[+CM^.S([=CJ$*R+ZST:3. #N.4* eofeof -#' + +example['x0201.utf'] = <<'eofeof'.unpack('u')[0] +MY86HZ*>2XX*KXX*MXX*OXX*QXX*SXX*LXX*NXX*PXX*RXX*T"N6%J.B+L>^\ +MH>^\HN^\H^^\I.^\I>^\IN^\I^^]@>^]@N^]@^^]A.^]A>^]AN^]APKEA:CH +MJ)CEC[?OO('OO*#OO(/OO(3OO(7OO+[OO(;OO(KOO(COO(GBB)+OO(OOO)WO +MO+OOO+WOO9OOO9WOOZ4*Y8V*Z*>2[[VV[[VW[[VX/>^]N>^]NN^]MN^^GN^] +MM^^^GN^]N.^^GN^]N>^^GN^]NN^^GN.!J`KEC8KHIY+OOHKOOI_OOHOOOI_O +MOHSOOI_OOHWOOI_OOH[OOI_OO;?OOI_OO;'OOIX*[[Z*[[Z?[[Z+[[Z?[[Z, +M"FAA;FMA:W4@[[Z*[[Z?[[Z+[[Z?[[Z,[[VD"N^^BN^^G^^^B^^^G^^^C.^^ +2G>^]H0KEC8KHIY+C@:[EOHP* +eofeof example['x0201.jis'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$C02-"(T,C @@ -144,7 +303,6 @@ M/1LH23DZ-EXW7CA>.5XZ7ALD0B1(&RA""ALD0D@^,U$;*$E*7TM?3%]-7TY? M-U\Q7ALH0@H;*$E*7TM?3!LH0@IH86YK86MU(!LH24I?2U],)!LH0@H;*$E* 97TM?3%TA&RA""ALD0D@^,U$D3CAE&RA""@`` eofeof -#` example['x0201.sosi'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA*"ALD0D$T,5$C02-"(T,C @@ -154,7 +312,6 @@ M*$H]#CDZ-EXW7CA>.5XZ7@\;)$(D2!LH2@H;)$)(/C-1&RA*#DI?2U],7TU? M3E\W7S%>#PH.2E]+7TP/&RA*"FAA;FMA:W4@#DI?2U],)`\;*$H*#DI?2U], 672$/&RA*"ALD0D@^,U$D3CAE&RA""@`` eofeof -#" example['x0201.x0208'] = <<'eofeof'.unpack('u')[0] M&R1"030S424K)2TE+R4Q)3,E+"4N)3`E,B4T&RA""ALD0D$T,5$;*$)!0D-$ @@ -164,7 +321,34 @@ M)$)(/C-1)5$E5"57)5HE724M(2PE(B$K&RA""ALD0B51)50E51LH0@IH86YK M86MU(!LD0B51)50E52$B&RA""ALD0B51)50E525S(2,;*$(*&R1"2#XS421. &.&4;*$(* eofeof -#` + +# -X is necessary to allow X0201 in SJIS +# -Z convert X0208 alphabet to ASCII +print "X0201 conversion: SJIS "; + test('-jXZ',example['x0201.sjis'],[example['x0201.x0208']]); +print "X0201 conversion: JIS "; + test('-jZ',example['x0201.jis'],[example['x0201.x0208']]); +print "X0201 conversion:SI/SO "; + test('-jZ',example['x0201.sosi'],[example['x0201.x0208']]); +print "X0201 conversion: EUC "; + test('-jZ',example['x0201.euc'],[example['x0201.x0208']]); +print "X0201 conversion: UTF8 "; + test('-jZ',example['x0201.utf'],[example['x0201.x0208']]); +# -x means X0201 output +print "X0201 output: SJIS "; + test('-xs',example['x0201.euc'],[example['x0201.sjis']]); +print "X0201 output: JIS "; + test('-xj',example['x0201.sjis'],[example['x0201.jis']]); +print "X0201 output: EUC "; + test('-xe',example['x0201.jis'],[example['x0201.euc']]); +print "X0201 output: UTF8 "; + test('-xw',example['x0201.jis'],[example['x0201.utf']]); + +# MIME decode + +print "\nMIME test\n\n"; + +# MIME ISO-2022-JP example['mime.iso2022'] = <<'eofeof'.unpack('u')[0] M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23TI566Q/4U9)1WEH2S\]"CT_ @@ -178,7 +362,6 @@ M96E23U!Y:S=D"FAS;U-G/3T_/2`]/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W M96E23U!Y:S=D:'-O4V<]/3\]"CT_25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I 44D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.ans.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -188,7 +371,6 @@ M/3])4T\M,C`R,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D"FAS;U-G/3T_/2`] M/TE33RTR,`HR,BU*4#]"/T=Y4D%.144W96E23U!Y:S=D:'-O4V<]/3\]"CT_ L25-/+3(P,C(M2E`_0C]'>5)!3D5%-V5I4D]*55EL3QM;2U-624=Y:$L_/0H_ eofeof -#' example['mime.unbuf.strict'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -206,7 +388,6 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.unbuf'] = <<'eofeof'.unpack('u')[0] M&R1"-$$[>B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 @@ -215,21 +396,48 @@ M(&QI;F4*&R1"-$$[>B1./RD[=C1!.WHD3C\I.W8;*$(*0G)O:V5N(&-AB1./RD;*$)HB1./RD[=ALH0@H;)$(T 603MZ)$XE1ALH0EM+4U9)1WEH2S\]"@`* eofeof -#" example['mime.base64'] = <<'eofeof'.unpack('u')[0] M9W-M5"])3&YG$I+-&=Q=4,S24LS9W%Q0E%:3TUI-39,,S0Q-&=S5T)1 M43!+9VUA1%9O3T@*9S)+1%1O3'=K8C)1;$E+;V=Q2T-X24MG9W5M0W%*3EEG <$E+9V=U;4,X64Q&9W)70S592VMG<6U""F=Q eofeof -#" example['mime.base64.ans'] = <<'eofeof'.unpack('u')[0] M&R1")$M&?B1I)#LD1D0Z)"TD7B0Y)"PA(D5L-7XV83E9)$B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"B1.)48E.25(&RA""ALD0C1!.WHD3B5&)3DE2!LH0@H;)$(D1B11 +M&RA"96YD"ALD0B0])"8D*R1*&RA"&R1"-$$[>B1./RD[=ALH0F5N9&]F;&EN +M90H;)$(T03MZ)$X_*3MV-$$[>B1./RD[=ALH0@I"tmp1");print OUT pack('u',$tmp);close(OUT); +# unbuf mode implies more pessimistic decode +print "MIME decode (unbuf) "; + $tmp = test('-jmNu',example['mime.iso2022'],[example['mime.unbuf'],example['mime.unbuf.alt']]); + # open(OUT,">tmp2");print OUT pack('u',$tmp);close(OUT); +print "MIME decode (base64) "; + test('-jTmB',example['mime.base64'],[example['mime.base64.ans']]); + +# MIME ISO-8859-1 example['mime.is8859'] = <<'eofeof'.unpack('u')[0] M/3])4T\M.#@U.2TQ/U$_*CU#-V%V83\_/2`*4&5E2P@1$5.34%22R`@7"`B36EN(&OF<&AEX0208 conversion -# X0208 aphabet -> ASCII -# X0201 相互変換 +example['test_data/cr.ans'] = <<'eofeof'.unpack('u')[0] +7&R1")$8D.21(&RA""G1EI**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DSR!# +M4B],1B"DSKG4H:,-"J2SI+.DSR!#4B"DSKG4H:,-I+.DLZ3/($Q&+T-2(*3. +9N=2AHPH-"J2SI+.DSR!,1B"DSKG4H:,*"@`` +eofeof -print "Next test is expeced to Fail.\n" +example['test_data/long-fold-1.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#tmp1');print OUT pack('u',$tmp);close(OUT); -# unbuf mode implies more pessimistic decode -print 'MIME decode (unbuf) ' -test('-mu', example['mime.iso2022'], example['mime.unbuf']) -print 'MIME decode (base64) ' -t = test('-mB', example['mime.base64'], example['mime.base64.ans']) +print "test_data/long-fold-1 "; + test('-jTF60',example['test_data/long-fold-1'],[example['test_data/long-fold-1.ans']]); +# test_data/long-fold -# MIME ISO-8859-1 +example['test_data/long-fold'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AHJ2SI.RD +M\J2]I,ZDWJ3>I**DQ*2KI*:DR*&BI,FDIJ3BI-^DT*2HI*RD[Z3KI*2DMZ&B +MI,BDP:3EI*:DQZ3!I.>D\Z2NI.RDZZ2KI.*DMZ3SI,JDI*&C"J2SI+.DS\.[ +'I*2YU*&C"@`` +eofeof -# Without -l, ISO-8859-1 was handled as X0201. +example['test_data/long-fold.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$HD+"0D)$HD+"0D)$HD+"%!)"0D+B1G)"8D+"0B)&HD7B0W)$8A(B0S +M)&PD)"(D1"0K&RA""ALD0B0F)$@A(B1))"8D8B1?)%`D*"0L +M)&\D:R0D)#5)#2D-):TI#46U*0V=K2VE1 +M5%X2D1-:TY343-*1&MK3WAS;U%G/3T_/2`*"3T_25-/+3(P,C(M +M2E`_0C]'>5)#2D0P:U!Y4D)*15%K4FE224I%;VM3>5)-2D4P:U1I4E!*1DEK +M5E-264=Y:$,_/2`*"3T_25-/+3(P,C(M2E`_0C]'>5)#2D9S:UAI4F9*1T%K +M65-2:4I'46M*0U)M2D-G:V%"A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +example['test_data/multi-line.ans'] = <<'eofeof'.unpack('u')[0] +MI,JDK*2DI,JDK*2DI,JDK*'!I*2DKJ3GI*:DK*2BI.JDWJ2WI,:AH@"DLZ3L +MI/*DO:3.I-ZDWJ2BI,2DJZ2FI,BAHJ3)I*:DXJ3?I-"DJ*2LI.^DZZ2DI+>A +MHJ3(I,&DY:2FI,>DP:3GI/.DKJ3LI.NDJZ3BI+>D\Z3*I*2AHPJDLZ2SI,_# +8NZ2DN=2AHP`*I+.DLZ3/P[NDI+G4H:,* +eofeof + +print "test_data/multi-line "; + test('-e',example['test_data/multi-line'],[example['test_data/multi-line.ans']]); +# test_data/nkf-19-bug-1 + +example['test_data/nkf-19-bug-1'] = <<'eofeof'.unpack('u')[0] +,I*:DJZ2D"KK8QJ,* +eofeof + +example['test_data/nkf-19-bug-1.ans'] = <<'eofeof'.unpack('u')[0] +8&R1")"8D*R0D&RA""ALD0CI81B,;*$(* +eofeof + +print "test_data/nkf-19-bug-1 "; + test('-Ej',example['test_data/nkf-19-bug-1'],[example['test_data/nkf-19-bug-1.ans']]); +# test_data/nkf-19-bug-2 + +example['test_data/nkf-19-bug-2'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +example['test_data/nkf-19-bug-2.ans'] = <<'eofeof'.unpack('u')[0] +%I-NDL@H` +eofeof + +print "test_data/nkf-19-bug-2 "; + test('-Ee',example['test_data/nkf-19-bug-2'],[example['test_data/nkf-19-bug-2.ans']]); +# test_data/nkf-19-bug-3 + +example['test_data/nkf-19-bug-3'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +example['test_data/nkf-19-bug-3.ans'] = <<'eofeof'.unpack('u')[0] +8[;'Q\,&L"N6ZSN\*\NT)ON7.SL_+"0D* +eofeof + +print "test_data/nkf-19-bug-3 "; + test('-e',example['test_data/nkf-19-bug-3'],[example['test_data/nkf-19-bug-3.ans']]); +# test_data/non-strict-mime + +example['test_data/non-strict-mime'] = <<'eofeof'.unpack('u')[0] +M/3])4T\M,C`R,BU*4#]"/PIG$E+:6=R,D-V;TMI +,9W-30V]O3&,*/ST* +eofeof + +example['test_data/non-strict-mime.ans'] = <<'eofeof'.unpack('u')[0] +M&R1")$8D)"0_)$`D)"1&)%XD.2$C&RA"#0H-"ALD0CMD)$\[?B$Y)6PE.21+ +<)&(]<20K)#LD1B0D)#\D0"0D)$8D)"1>&RA""@`` +eofeof + +print "test_data/non-strict-mime "; + test('-jTmN',example['test_data/non-strict-mime'],[example['test_data/non-strict-mime.ans']]); +# test_data/q-encode-softrap + +example['test_data/q-encode-softrap'] = <<'eofeof'.unpack('u')[0] +H/3%")$(T03MZ)3T*,R$\)4DD3CTQ0BA""CTQ0B1"2E$T.3TQ0BA""@`` +eofeof + +example['test_data/q-encode-softrap.ans'] = <<'eofeof'.unpack('u')[0] +>&R1"-$$[>B4S(3PE221.&RA""ALD0DI1-#D;*$(* +eofeof + +print "test_data/q-encode-softrap "; + test('-jTmQ',example['test_data/q-encode-softrap'],[example['test_data/q-encode-softrap.ans']]); +# test_data/rot13 + +example['test_data/rot13'] = <<'eofeof'.unpack('u')[0] +MI+.D\Z3+I,&DSZ&BS:W"]*3(I*2DI*3>I+FAHPH*;FMF('9EI+FDK*&B05-#24D@I,O"T*2WI,8@ +M4D]4,3,@I*P*P+6DMZ2OQK"DI*3&I*2DRJ2DI.BDIJ3'H:*PRK*\I,ZDZ*2F +MI,O*T;2YI+6D[*3>I+ND\Z&C"@HE(&5C:&\@)VAO9V4G('P@;FMF("UR"FAO +#9V4* +eofeof + +example['test_data/rot13.ans'] = <<'eofeof'.unpack('u')[0] +M&R1"4V)31%-Z4W!3?E!1?%QQ15-W4U-34U,O4VA04ALH0@H*87AS(&ER92XQ +M+CDR(!LD0E-#?$E\(E-D4VI3=5-34VY3;U-34W534U,O4VA36U!1&RA"3D90 +M5E8@&R1"4WIQ(5-F4W4;*$(@14)',3,@&R1"4UL;*$(*&R1";V139E->=5]3 +M4U-U4U-3>5-34SE355-V4%%?>6%K4WU3.5-54WIY(F-H4V13/5,O4VI31%!2 +A&RA""@HE(')P=6(@)W5B='(G('P@87AS("UE"G5B='(* +eofeof + +print "test_data/rot13 "; + test('-jr',example['test_data/rot13'],[example['test_data/rot13.ans']]); +# test_data/slash + +example['test_data/slash'] = <<'eofeof'.unpack('u')[0] +7("`]/U8\5"U5.5=%2RTK.U