summary | refs | log | tree | commit | diff
path: root/enc
diff options
context:
space:
mode:
author: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-01-29 11:44:08 +0000
committer: naruse <naruse@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2008-01-29 11:44:08 +0000
commit: b9821b02a0c5c48fbde23cea6576e0f9e044af78 (patch)
tree: bbc2bebe54b74afa74a7ceec399d5805e5518e76 /enc
parent: 7a8c02cd47750decd6c7507d8c5f090a8d3b4605 (diff)
* enc/trans/make_transdb.rb: add for make transdb.h.
* dmytranscode.c: add for miniruby.
* enc/gbk.c (gbk_left_adjust_char_head, gbk_is_allowed_reverse_match): fix odd regexp match. [ruby-dev:33502]
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@15321 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'enc')
-rw-r--r-- enc/gbk.c 47
1 files changed, 35 insertions, 12 deletions
diff --git a/enc/gbk.c b/enc/gbk.c
index a2153f8c74..787b1815e0 100644
--- a/enc/gbk.c
+++ b/enc/gbk.c
@@ -29,7 +29,7 @@
#include "regenc.h"
-static const int EncLen_gbk[] = {
+static const int EncLen_GBK[] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
@@ -48,6 +48,28 @@ static const int EncLen_gbk[] = {
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
};
+static const char GBK_CAN_BE_TRAIL_TABLE[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
+};
+
+#define GBK_ISMB_FIRST(byte) (EncLen_GBK[byte] > 1)
+#define GBK_ISMB_TRAIL(byte) GBK_CAN_BE_TRAIL_TABLE[(byte)]
+
typedef enum { FAILURE = -2, ACCEPT = -1, S0 = 0, S1 } state_t;
#define A ACCEPT
#define F FAILURE
@@ -101,7 +123,7 @@ gbk_mbc_enc_len(const UChar* p, const UChar* e, OnigEncoding enc ARG_UNUSED)
return s == ACCEPT ? ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(n) : \
ONIGENC_CONSTRUCT_MBCLEN_INVALID()
if (s < 0) RETURN(1);
- if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_gbk[firstbyte]-1);
+ if (p == e) return ONIGENC_CONSTRUCT_MBCLEN_NEEDMORE(EncLen_GBK[firstbyte]-1);
s = trans[s][*p++];
RETURN(2);
#undef RETURN
@@ -142,21 +164,23 @@ gbk_is_code_ctype(OnigCodePoint code, unsigned int ctype, OnigEncoding enc)
return onigenc_mb2_is_code_ctype(enc, code, ctype);
}
-#define gbk_islead(c) ((c) < 0xa1 || (c) == 0xff)
-
static UChar*
gbk_left_adjust_char_head(const UChar* start, const UChar* s, OnigEncoding enc)
{
- /* Assumed in this encoding,
- mb-trail bytes don't mix with single bytes.
- */
const UChar *p;
int len;
if (s <= start) return (UChar* )s;
p = s;
- while (!gbk_islead(*p) && p > start) p--;
+ if (GBK_ISMB_TRAIL(*p)) {
+ while (p > start) {
+ if (! GBK_ISMB_FIRST(*--p)) {
+ p++;
+ break;
+ }
+ }
+ }
len = enclen(enc, p, s);
if (p + len > s) return (UChar* )p;
p += len;
@@ -167,13 +191,12 @@ static int
gbk_is_allowed_reverse_match(const UChar* s, const UChar* end ARG_UNUSED, OnigEncoding enc ARG_UNUSED)
{
const UChar c = *s;
- if (c <= 0x7e) return TRUE;
- else return FALSE;
+ return (GBK_ISMB_TRAIL(c) ? FALSE : TRUE);
}
-OnigEncodingDefine(gbk, gbk) = {
+OnigEncodingDefine(gbk, GBK) = {
gbk_mbc_enc_len,
- "GBK", /* name */
+ "GBK", /* name */
2, /* max enc length */
1, /* min enc length */
onigenc_is_mbc_newline_0x0a,