summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- ChangeLog 14
-rw-r--r-- ext/nkf/lib/kconv.rb 29
-rw-r--r-- ext/nkf/nkf-utf8/nkf.c 85
-rw-r--r-- ext/nkf/nkf.c 34
4 files changed, 131 insertions, 31 deletions
diff --git a/ChangeLog b/ChangeLog
index 6d53f30a44..bc32af7cd8 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+Fri Sep 15 20:22:15 2006 NARUSE, Yui <naruse@ruby-lang.org>
+
+ * ext/nkf/nkf-utf8/nkf.c: imported nkf 2.0.8 rev.110.
+ * Fix: check_bom cuts \xfe\xff\xXX\xXX of UTF-32LE.
+ * Add support --ic=UTF-32.
+ * Fix: can't guess UTF-16 and UTF-32.
+ * Fix: can't decode beyond BMP of UTF-16LE.
+
+ * ext/nkf/nkf.c (guess): Support UTF-32.
+
+ * ext/nkf/lib/kconv.rb (kconv): Support UTF-32.
+
+ * ext/nkf/lib/kconv.rb (toutf32): new method.
+
Fri Sep 15 05:23:24 2006 NARUSE, Yui <naruse@ruby-lang.org>
* ext/nkf/nkf-8/nkf.c: imported nkf 2.0.8 2006-09-15.
diff --git a/ext/nkf/lib/kconv.rb b/ext/nkf/lib/kconv.rb
index 4ffe8d984e..91553228fe 100644
--- a/ext/nkf/lib/kconv.rb
+++ b/ext/nkf/lib/kconv.rb
@@ -105,6 +105,8 @@ module Kconv
opt << 'W'
when ::NKF::UTF16
opt << 'W16'
+ when ::NKF::UTF32
+ opt << 'W32'
end
case out_code
@@ -118,6 +120,8 @@ module Kconv
opt << 'w'
when ::NKF::UTF16
opt << 'w16'
+ when ::NKF::UTF32
+ opt << 'w32'
when ::NKF::NOCONV
return str
end
@@ -202,6 +206,20 @@ module Kconv
end
module_function :toutf16
+ # call-seq:
+ # Kconv.toutf32(str) -> string
+ #
+ # Convert <code>str</code> to UTF-32
+ #
+ # *Note*
+ # This method decodes MIME encoded strings and
+ # converts halfwidth katakana to fullwidth katakana.
+ # If you don't want it, use NKF.nkf('-w32xm0', str).
+ def toutf32(str)
+ ::NKF::nkf('-w32m', str)
+ end
+ module_function :toutf32
+
#
# guess
#
@@ -337,6 +355,17 @@ class String
# If you don't want it, use NKF.nkf('-w16xm0', str).
def toutf16; Kconv.toutf16(self) end
+ # call-seq:
+ # String#toutf32 -> string
+ #
+ # Convert <code>self</code> to UTF-32
+ #
+ # *Note*
+ # This method decodes MIME encoded strings and
+ # converts halfwidth katakana to fullwidth katakana.
+ # If you don't want it, use NKF.nkf('-w32xm0', str).
+ def toutf32; Kconv.toutf32(self) end
+
#
# is Encoding
#
diff --git a/ext/nkf/nkf-utf8/nkf.c b/ext/nkf/nkf-utf8/nkf.c
index 2f3da8b373..bd2e90c77c 100644
--- a/ext/nkf/nkf-utf8/nkf.c
+++ b/ext/nkf/nkf-utf8/nkf.c
@@ -581,6 +581,8 @@ struct input_code input_code_list[] = {
{"Shift_JIS", 0, 0, 0, {0, 0, 0}, s_status, s_iconv, 0},
#ifdef UTF8_INPUT_ENABLE
{"UTF-8", 0, 0, 0, {0, 0, 0}, w_status, w_iconv, 0},
+ {"UTF-16", 0, 0, 0, {0, 0, 0}, NULL, w_iconv16, 0},
+ {"UTF-32", 0, 0, 0, {0, 0, 0}, NULL, w_iconv32, 0},
#endif
{0}
};
@@ -1293,6 +1295,15 @@ void options(unsigned char *cp)
strcmp(codeset, "UTF-16LE-BOM") == 0){
input_f = UTF16_INPUT;
input_endian = ENDIAN_LITTLE;
+ }else if(strcmp(codeset, "UTF-32") == 0 ||
+ strcmp(codeset, "UTF-32BE") == 0 ||
+ strcmp(codeset, "UTF-32BE-BOM") == 0){
+ input_f = UTF32_INPUT;
+ input_endian = ENDIAN_BIG;
+ }else if(strcmp(codeset, "UTF-32LE") == 0 ||
+ strcmp(codeset, "UTF-32LE-BOM") == 0){
+ input_f = UTF32_INPUT;
+ input_endian = ENDIAN_LITTLE;
#endif
}
continue;
@@ -1901,12 +1912,7 @@ void options(unsigned char *cp)
}
}
-#ifdef ANSI_C_PROTOTYPE
struct input_code * find_inputcode_byfunc(nkf_char (*iconv_func)(nkf_char c2,nkf_char c1,nkf_char c0))
-#else
-struct input_code * find_inputcode_byfunc(iconv_func)
- nkf_char (*iconv_func)();
-#endif
{
if (iconv_func){
struct input_code *p = input_code_list;
@@ -2227,6 +2233,12 @@ void code_status(nkf_char c)
struct input_code *result = 0;
struct input_code *p = input_code_list;
while (p->name){
+ if (!p->status_func) {
+ ++p;
+ continue;
+ }
+ if (!p->status_func)
+ continue;
(p->status_func)(p, c);
if (p->stat > 0){
action_flag = 0;
@@ -2407,8 +2419,11 @@ void check_bom(FILE *f)
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_BIG;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_BIG;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFE,f);
}else if(c2 == 0xFF){
@@ -2416,8 +2431,11 @@ void check_bom(FILE *f)
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_2143;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_2143;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
@@ -2431,7 +2449,10 @@ void check_bom(FILE *f)
if(!input_f){
set_iconv(TRUE, w_iconv);
}
- return;
+ if (iconv == w_iconv) {
+ return;
+ }
+ (*i_ungetc)(0xBF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xBB,f);
}else (*i_ungetc)(c2,f);
@@ -2444,16 +2465,22 @@ void check_bom(FILE *f)
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_3412;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_3412;
+ return;
+ }
+ (*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv16);
}
- input_endian = ENDIAN_BIG;
- return;
+ if (iconv == w_iconv16) {
+ input_endian = ENDIAN_BIG;
+ return;
+ }
+ (*i_ungetc)(0xFF,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFE,f);
break;
@@ -2464,16 +2491,22 @@ void check_bom(FILE *f)
if(!input_f){
set_iconv(TRUE, w_iconv32);
}
- input_endian = ENDIAN_LITTLE;
- return;
+ if (iconv == w_iconv32) {
+ input_endian = ENDIAN_LITTLE;
+ return;
+ }
+ (*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0x00,f);
}else (*i_ungetc)(c2,f);
if(!input_f){
set_iconv(TRUE, w_iconv16);
}
- input_endian = ENDIAN_LITTLE;
- return;
+ if (iconv == w_iconv16) {
+ input_endian = ENDIAN_LITTLE;
+ return;
+ }
+ (*i_ungetc)(0xFE,f);
}else (*i_ungetc)(c2,f);
(*i_ungetc)(0xFF,f);
break;
@@ -2557,21 +2590,21 @@ nkf_char kanji_convert(FILE *f)
c0 <<= 8;
if ((c3 = (*i_getc)(f)) != EOF) {
c0 |= c3;
- } else c1 = EOF;
- } else c1 = EOF;
+ } else c2 = EOF;
+ } else c2 = EOF;
}
- }
+ } else c2 = EOF;
} else {
if ((c2 = (*i_getc)(f)) != EOF) {
if (0xD8 <= c2 && c2 <= 0xDB) {
if ((c3 = (*i_getc)(f)) != EOF) {
- c3 <<= 8;
if ((c0 = (*i_getc)(f)) != EOF) {
+ c0 <<= 8;
c0 |= c3;
- } else c1 = EOF;
- } else c1 = EOF;
+ } else c2 = EOF;
+ } else c2 = EOF;
}
- } else c1 = EOF;
+ } else c2 = EOF;
}
SEND;
} else if(iconv == w_iconv32){
@@ -2595,7 +2628,7 @@ nkf_char kanji_convert(FILE *f)
}
c2 = 0;
}else{
- c1 = EOF;
+ c2 = EOF;
}
SEND;
} else
diff --git a/ext/nkf/nkf.c b/ext/nkf/nkf.c
index 8a4bcfce64..e12df16960 100644
--- a/ext/nkf/nkf.c
+++ b/ext/nkf/nkf.c
@@ -306,6 +306,8 @@ rb_nkf_guess1(VALUE obj, VALUE src)
* "UTF-8"
* when NKF::UTF16
* "UTF-16"
+ * when NKF::UTF32
+ * "UTF-32"
* when NKF::UNKNOWN
* "UNKNOWN"
* when NKF::BINARY
@@ -345,6 +347,8 @@ rb_nkf_guess2(VALUE obj, VALUE src)
code = _UTF8;
} else if (strcmp(input_codename, "UTF-16") == 0) {
code = _UTF16;
+ } else if (strcmp(input_codename, "UTF-32") == 0) {
+ code = _UTF32;
} else if (strlen(input_codename) > 0) {
code = _UNKNOWN;
}
@@ -382,16 +386,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
*
* Output is buffered (DEFAULT), Output is unbuffered.
*
- * === -j -s -e -w -w16
+ * === -j -s -e -w -w16 -w32
*
* Output code is ISO-2022-JP (7bit JIS), Shift_JIS, EUC-JP,
- * UTF-8N, UTF-16BE.
+ * UTF-8N, UTF-16BE, UTF-32BE.
* Without this option and compile option, ISO-2022-JP is assumed.
*
- * === -J -S -E -W -W16
+ * === -J -S -E -W -W16 -W32
*
* Input assumption is JIS 7 bit, Shift_JIS, EUC-JP,
- * UTF-8, UTF-16LE.
+ * UTF-8, UTF-16, UTF-32.
*
* ==== -J
*
@@ -574,6 +578,16 @@ rb_nkf_guess2(VALUE obj, VALUE src)
*
* [UTF-16LE-BOM] UTF-16 Little Endian with BOM
*
+ * [UTF-32] same as UTF-32BE
+ *
+ * [UTF-32BE] UTF-32 Big Endian without BOM
+ *
+ * [UTF-32BE-BOM] UTF-32 Big Endian with BOM
+ *
+ * [UTF-32LE] UTF-32 Little Endian without BOM
+ *
+ * [UTF-32LE-BOM] UTF-32 Little Endian with BOM
+ *
* [UTF8-MAC] NKDed UTF-8, a.k.a. UTF8-NFD (input only)
*
* === --fb-{skip, html, xml, perl, java, subchar}
@@ -587,10 +601,20 @@ rb_nkf_guess2(VALUE obj, VALUE src)
* nkf adds a specified escape character to specified 2nd byte of Shift_JIS characters.
* 1st byte of argument is the escape character and following bytes are target characters.
*
- * === --disable-cp932ext
+ * === --no-cp932ext
*
* Handle the characters extended in CP932 as unassigned characters.
*
+ * === --no-best-fit-chars
+ *
+ * When converting from Unicode to an encoded byte sequence,
+ * don't convert characters which are not round-trip safe.
+ * When converting from Unicode to Unicode,
+ * with this and the -x option, nkf can be used as a UTF converter.
+ * (In other words, without this and the -x option, nkf doesn't preserve some characters.)
+ *
+ * When nkf converts a string that relates to a path, you should use this option.
+ *
* === --cap-input
*
* Decode hex encoded characters.