diff options
Diffstat (limited to 'ext/nkf/nkf-utf8/nkf.c')
-rw-r--r-- | ext/nkf/nkf-utf8/nkf.c | 148 |
1 files changed, 135 insertions, 13 deletions
diff --git a/ext/nkf/nkf-utf8/nkf.c b/ext/nkf/nkf-utf8/nkf.c
index 9fd1436239..9b6fffadc4 100644
--- a/ext/nkf/nkf-utf8/nkf.c
+++ b/ext/nkf/nkf-utf8/nkf.c
@@ -41,7 +41,7 @@
 ***********************************************************************/
 /* $Id$ */
 #define NKF_VERSION "2.0.5"
-#define NKF_RELEASE_DATE "2005-06-28"
+#define NKF_RELEASE_DATE "2005-07-05"
 #include "config.h"

 static char *CopyRight =
@@ -105,6 +105,11 @@ static char *CopyRight =
 #ifdef PERL_XS
 #undef OVERWRITE
 #endif
+#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#define UNICODE_ENABLE
+#else
+#undef UNICODE_NORMALIZATION
+#endif

 #ifndef PERL_XS
 #include <stdio.h>
@@ -246,7 +251,7 @@ static char *CopyRight =
 #define GETA2 0x2e


-#if defined( UTF8_OUTPUT_ENABLE ) || defined( UTF8_INPUT_ENABLE )
+#ifdef UNICODE_ENABLE
 #define sizeof_euc_utf8 94
 #define sizeof_euc_to_utf8_1byte 94
 #define sizeof_euc_to_utf8_2bytes 94
@@ -389,17 +394,21 @@ static int x0201_f = TRUE; /* Assume JISX0201 kana */
 static int x0201_f = NO_X0201; /* Assume NO JISX0201 */
 #endif
 static int iso2022jp_f = FALSE; /* convert ISO-2022-JP */
+#ifdef UNICODE_ENABLE
+static int internal_unicode_f = FALSE; /* Internal Unicode Processing */
+#endif
 #ifdef UTF8_OUTPUT_ENABLE
 static int unicode_bom_f= 0; /* Output Unicode BOM */
 static int w_oconv16_LE = 0; /* utf-16 little endian */
 static int ms_ucs_map_f = FALSE; /* Microsoft UCS Mapping Compatible */
 #endif

-
-#ifdef NUMCHAR_OPTION
-
-#define CLASS_MASK 0x0f000000
-#define CLASS_UTF16 0x01000000
+#ifdef UNICODE_NORMALIZATION
+static int nfc_f = FALSE;
+static int (*i_nfc_getc)PROTO((FILE *)) = std_getc; /* input of ugetc */
+static int (*i_nfc_ungetc)PROTO((int c ,FILE *f)) = std_ungetc;
+STATIC int nfc_getc PROTO((FILE *f));
+STATIC int nfc_ungetc PROTO((int c,FILE *f));
 #endif

 #ifdef INPUT_OPTION
@@ -414,7 +423,11 @@ static int (*i_ugetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
 static int (*i_uungetc)PROTO((int c ,FILE *f)) = std_ungetc;
 STATIC int url_getc PROTO((FILE *f));
 STATIC int url_ungetc PROTO((int c,FILE *f));
+#endif

+#ifdef NUMCHAR_OPTION
+#define CLASS_MASK 0x0f000000
+#define CLASS_UTF16 0x01000000
 static int numchar_f = FALSE;
 static int (*i_ngetc)PROTO((FILE *)) = std_getc; /* input of ugetc */
 static int (*i_nungetc)PROTO((int c ,FILE *f)) = std_ungetc;
@@ -926,6 +939,9 @@ struct {
 #ifdef X0212_ENABLE
     {"x0212", ""},
 #endif
+#ifdef UNICODE_ENABLE
+    {"internal-unicode", ""},
+#endif
 #ifdef UTF8_OUTPUT_ENABLE
     {"utf8", "w"},
     {"utf16", "w16"},
@@ -935,6 +951,9 @@ struct {
     {"utf8-input", "W"},
     {"utf16-input", "W16"},
 #endif
+#ifdef UNICODE_NORMALIZATION
+    {"utf8mac-input", ""},
+#endif
 #ifdef OVERWRITE
     {"overwrite", ""},
 #endif
@@ -1070,12 +1089,25 @@ options(cp)
                     return;
                 }
 #endif
+#ifdef UNICODE_ENABLE
+                if (strcmp(long_option[i].name, "internal-unicode") == 0){
+                    internal_unicode_f = TRUE;
+                    continue;
+                }
+#endif
 #ifdef UTF8_OUTPUT_ENABLE
                 if (strcmp(long_option[i].name, "ms-ucs-map") == 0){
                     ms_ucs_map_f = TRUE;
                     continue;
                 }
 #endif
+#ifdef UNICODE_NORMALIZATION
+                if (strcmp(long_option[i].name, "utf8mac-input") == 0){
+                    input_f = UTF8_INPUT;
+                    nfc_f = TRUE;
+                    continue;
+                }
+#endif
                 if (strcmp(long_option[i].name, "prefix=") == 0){
                     if (*p == '=' && ' ' < p[1] && p[1] < 128){
                         for (i = 2; ' ' < p[i] && p[i] < 128; i++){
@@ -1849,6 +1881,12 @@ module_connection()
         i_nungetc = i_ungetc; i_ungetc= numchar_ungetc;
     }
 #endif
+#ifdef UNICODE_NORMALIZATION
+    if (nfc_f && input_f == UTF8_INPUT){
+        i_nfc_getc = i_getc; i_getc = nfc_getc;
+        i_nfc_ungetc = i_ungetc; i_ungetc= nfc_ungetc;
+    }
+#endif
     if (mime_f && mimebuf_f==FIXED_MIME) {
         i_mgetc = i_getc; i_getc = mime_getc;
         i_mungetc = i_ungetc; i_ungetc = mime_ungetc;
@@ -2480,7 +2518,30 @@ w_iconv(c2, c1, c0)
     int c2,
                     c1, c0;
 {
-    int ret = w2e_conv(c2, c1, c0, &c2, &c1);
+    int ret = 0;
+    unsigned short val = 0;
+
+    if (c0 == 0){ /* no third byte supplied: classify the lead byte c2 */
+        if (c2 < 0x80 || (c2 & 0xe0) == 0xc0) /* 0x00-0x7f 0xc0-0xdf */
+            ; /* 1 or 2 bytes: fall through to the conversion below */
+        else if ((c2 & 0xf0) == 0xe0) /* 0xe0-0xef */
+            return -1; /* 3bytes */
+        /* disabled: else if (0xf0 <= c2)
+               return 0;  (4,5,6bytes) */
+        else if ((c2 & 0xc0) == 0x80) /* 0x80-0xbf */
+            return 0; /* trail byte */
+        else return 0;
+    }
+    if (c2 == EOF);
+    else if (c2 == 0xef && c1 == 0xbb && c0 == 0xbf)
+        return 0; /* throw BOM */
+    else if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16)){
+        val = ww16_conv(c2, c1, c0);
+        c2 = (val >> 8) & 0xff;
+        c1 = val & 0xff;
+    } else {
+        ret = w2e_conv(c2, c1, c0, &c2, &c1);
+    }
     if (ret == 0){
         (*oconv)(c2, c1);
     }
@@ -2566,7 +2627,7 @@ int
 w_iconv16(c2, c1, c0)
     int c2, c1,c0;
 {
-    int ret;
+    int ret = 0;

     if (c2==0376 && c1==0377){
         utf16_mode = UTF16BE_INPUT;
@@ -2583,7 +2644,8 @@ w_iconv16(c2, c1, c0)
         (*oconv)(c2, c1);
         return 0;
     }
-    ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
+    if (internal_unicode_f && (output_conv == w_oconv || output_conv == w_oconv16));
+    else ret = w16e_conv(((c2<<8)&0xff00) + c1, &c2, &c1);
     if (ret) return ret;
     (*oconv)(c2, c1);
     return 0;
@@ -2668,6 +2730,7 @@ w_oconv(c2, c1)
                     c1;
 {
     int c0;
+    unsigned short val;
     if (c2 == EOF) {
         (*o_putc)(EOF);
         return;
@@ -2699,9 +2762,10 @@ w_oconv(c2, c1)
         output_mode = ISO8859_1;
         (*o_putc)(c1 | 0x080);
     } else {
-        unsigned short val;
         output_mode = UTF8;
-        val = e2w_conv(c2, c1);
+        if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16))
+            val = ((c2<<8)&0xff00) + c1;
+        else val = e2w_conv(c2, c1);
         if (val){
             w16w_conv(val, &c2, &c1, &c0);
             (*o_putc)(c2);
@@ -2734,7 +2798,8 @@ w_oconv16(c2, c1)
         unicode_bom_f=1;
     }

-    if (c2 == ISO8859_1) {
+    if (internal_unicode_f && (iconv == w_iconv || iconv == w_iconv16)){
+    } else if (c2 == ISO8859_1) {
         c2 = 0;
         c1 |= 0x80;
 #ifdef NUMCHAR_OPTION
@@ -3862,6 +3927,57 @@ numchar_ungetc(c, f)
 }
 #endif

+#ifdef UNICODE_NORMALIZATION
+
+/* Normalization Form C: read via i_nfc_getc; when the bytes match a normalization_table[].nfd entry, substitute its .nfc bytes; unget all but the first byte and return it */
+int
+nfc_getc(f)
+     FILE *f;
+{
+    int (*g)() = i_nfc_getc;
+    int (*u)() = i_nfc_ungetc;
+    int i=0, j, k=1, lower, upper;
+    int buf[9];
+    int *array = NULL;
+    extern struct normalization_pair normalization_table[];
+
+    buf[i] = (*g)(f);
+    while (k > 0 && ((buf[i] & 0xc0) != 0x80)){
+        lower=0, upper=NORMALIZATION_TABLE_LENGTH-1;
+        while (upper >= lower) {
+            j = (lower+upper) / 2;
+            array = normalization_table[j].nfd;
+            for (k=0; k < NORMALIZATION_TABLE_NFD_LENGTH && array[k]; k++){
+                if (array[k] != buf[k]){
+                    array[k] < buf[k] ? (lower = j + 1) : (upper = j - 1);
+                    k = 0;
+                    break;
+                } else if (k >= i)
+                    buf[++i] = (*g)(f);
+            }
+            if (k > 0){
+                array = normalization_table[j].nfc;
+                for (i=0; i < NORMALIZATION_TABLE_NFC_LENGTH && array[i]; i++)
+                    buf[i] = array[i];
+                i--;
+                break;
+            }
+        }
+        while (i > 0)
+            (*u)(buf[i--], f);
+    }
+    return buf[0];
+}
+
+int
+nfc_ungetc(c, f)
+     int c;
+     FILE *f;
+{
+    return (*i_nfc_ungetc)(c, f);
+}
+#endif /* UNICODE_NORMALIZATION */
+

 int
 mime_getc(f)
@@ -4597,11 +4713,17 @@ reinit()
     x0201_f = NO_X0201;
 #endif
     iso2022jp_f = FALSE;
+#ifdef UNICODE_ENABLE
+    internal_unicode_f = FALSE; /* back to the static initializer's value */
+#endif
 #ifdef UTF8_OUTPUT_ENABLE
     unicode_bom_f = 0;
     w_oconv16_LE = 0;
     ms_ucs_map_f = FALSE;
 #endif
+#ifdef UNICODE_NORMALIZATION
+    nfc_f = FALSE;
+#endif
 #ifdef INPUT_OPTION
     cap_f = FALSE;
     url_f = FALSE;