summary | refs | log | tree | commit | diff
path: root/euc_jp.c
diff options
context:
space:
mode:
author ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-11-04 14:31:26 +0000
committer ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-11-04 14:31:26 +0000
commit 5e853c811ce1d6d6edc187e580a14133667e1058 (patch)
tree 4ecf2cb00a79a481ee5aeda802d5bb73415ca8f5 /euc_jp.c
parent 67ae0fb9aced8cf56de10a1fd400a236bd753b60 (diff)
This commit was generated by cvs2svn to compensate for changes in r7203,
which included commits to RCS files with non-trunk default branches.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7204 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
Diffstat (limited to 'euc_jp.c')
-rw-r--r-- euc_jp.c 161
1 file changed, 101 insertions, 60 deletions
diff --git a/euc_jp.c b/euc_jp.c
index 79c1dca454..a152ccb452 100644
--- a/euc_jp.c
+++ b/euc_jp.c
@@ -1,23 +1,69 @@
/**********************************************************************
-
euc_jp.c - Oniguruma (regular expression library)
-
- Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
#include "regenc.h"
#define eucjp_islead(c) ((UChar )((c) - 0xa1) > 0xfe - 0xa1)
+static int EncLen_EUCJP[] = {
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
+};
+
+static int
+eucjp_mbc_enc_len(UChar* p)
+{
+ return EncLen_EUCJP[*p];
+}
+
static OnigCodePoint
eucjp_mbc_to_code(UChar* p, UChar* end)
{
int c, i, len;
OnigCodePoint n;
- c = *p++;
- len = enc_len(ONIG_ENCODING_EUC_JP, c);
- n = c;
+ len = enc_len(ONIG_ENCODING_EUC_JP, p);
+ n = (OnigCodePoint )*p++;
if (len == 1) return n;
for (i = 1; i < len; i++) {
@@ -31,11 +77,13 @@ eucjp_mbc_to_code(UChar* p, UChar* end)
static int
eucjp_code_to_mbclen(OnigCodePoint code)
{
- if ((code & 0xff0000) != 0) return 3;
+ if (ONIGENC_IS_CODE_ASCII(code)) return 1;
+ else if ((code & 0xff0000) != 0) return 3;
else if ((code & 0xff00) != 0) return 2;
- else return 1;
+ else return 0;
}
+#if 0
static int
eucjp_code_to_mbc_first(OnigCodePoint code)
{
@@ -43,27 +91,16 @@ eucjp_code_to_mbc_first(OnigCodePoint code)
if ((code & 0xff0000) != 0) {
first = (code >> 16) & 0xff;
- /*
- if (enc_len(ONIG_ENCODING_EUC_JP, first) != 3)
- return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
- */
}
else if ((code & 0xff00) != 0) {
first = (code >> 8) & 0xff;
- /*
- if (enc_len(ONIG_ENCODING_EUC_JP, first) != 2)
- return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
- */
}
else {
- /*
- if (enc_len(ONIG_ENCODING_EUC_JP, code) != 1)
- return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
- */
return (int )code;
}
return first;
}
+#endif
static int
eucjp_code_to_mbc(OnigCodePoint code, UChar *buf)
@@ -75,44 +112,57 @@ eucjp_code_to_mbc(OnigCodePoint code, UChar *buf)
*p++ = (UChar )(code & 0xff);
#if 1
- if (enc_len(ONIG_ENCODING_EUC_JP, buf[0]) != (p - buf))
+ if (enc_len(ONIG_ENCODING_EUC_JP, buf) != (p - buf))
return ONIGENCERR_INVALID_WIDE_CHAR_VALUE;
#endif
return p - buf;
}
static int
-eucjp_mbc_to_lower(UChar* p, UChar* lower)
+eucjp_mbc_to_normalize(OnigAmbigType flag, UChar** pp, UChar* end,
+ UChar* lower)
{
int len;
+ UChar* p = *pp;
if (ONIGENC_IS_MBC_ASCII(p)) {
- *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
+ if ((flag & ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE) != 0) {
+ *lower = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);
+ }
+ else {
+ *lower = *p;
+ }
+
+ (*pp)++;
return 1;
}
else {
- len = enc_len(ONIG_ENCODING_EUC_JP, *p);
+ len = enc_len(ONIG_ENCODING_EUC_JP, p);
if (lower != p) {
- /* memcpy(lower, p, len); */
int i;
for (i = 0; i < len; i++) {
*lower++ = *p++;
}
}
+ (*pp) += len;
return len; /* return byte length of converted char to lower */
}
}
static int
-eucjp_code_is_ctype(OnigCodePoint code, unsigned int ctype)
+eucjp_is_mbc_ambiguous(OnigAmbigType flag, UChar** pp, UChar* end)
+{
+ return onigenc_mbn_is_mbc_ambiguous(ONIG_ENCODING_EUC_JP, flag, pp, end);
+}
+
+static int
+eucjp_is_code_ctype(OnigCodePoint code, unsigned int ctype)
{
if ((ctype & ONIGENC_CTYPE_WORD) != 0) {
if (code < 128)
return ONIGENC_IS_ASCII_CODE_CTYPE(code, ctype);
- else {
- int first = eucjp_code_to_mbc_first(code);
- return (enc_len(ONIG_ENCODING_EUC_JP, first) > 1 ? TRUE : FALSE);
- }
+ else
+ return (eucjp_code_to_mbclen(code) > 1 ? TRUE : FALSE);
ctype &= ~ONIGENC_CTYPE_WORD;
if (ctype == 0) return FALSE;
@@ -137,7 +187,7 @@ eucjp_left_adjust_char_head(UChar* start, UChar* s)
p = s;
while (!eucjp_islead(*p) && p > start) p--;
- len = enc_len(ONIG_ENCODING_EUC_JP, *p);
+ len = enc_len(ONIG_ENCODING_EUC_JP, p);
if (p + len > s) return p;
p += len;
return p + ((s - p) & ~1);
@@ -154,38 +204,29 @@ eucjp_is_allowed_reverse_match(UChar* s, UChar* end)
}
OnigEncodingType OnigEncodingEUC_JP = {
+ eucjp_mbc_enc_len,
+ "EUC-JP", /* name */
+ 3, /* max enc length */
+ 1, /* min enc length */
+ ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE,
{
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3,
- 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
- 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
- 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1
+ (OnigCodePoint )'\\' /* esc */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar '.' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anytime '*' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* zero or one time '?' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* one or more time '+' */
+ , (OnigCodePoint )ONIG_INEFFECTIVE_META_CHAR /* anychar anytime */
},
- "EUC-JP", /* name */
- 3, /* max byte length */
- FALSE, /* is_fold_match */
- ONIGENC_CTYPE_SUPPORT_LEVEL_SB, /* ctype_support_level */
- FALSE, /* is continuous sb mb codepoint */
+ onigenc_is_mbc_newline_0x0a,
eucjp_mbc_to_code,
eucjp_code_to_mbclen,
eucjp_code_to_mbc,
- eucjp_mbc_to_lower,
- onigenc_mbn_mbc_is_case_ambig,
- eucjp_code_is_ctype,
- onigenc_nothing_get_ctype_code_range,
+ eucjp_mbc_to_normalize,
+ eucjp_is_mbc_ambiguous,
+ onigenc_ascii_get_all_pair_ambig_codes,
+ onigenc_nothing_get_all_comp_ambig_codes,
+ eucjp_is_code_ctype,
+ onigenc_not_support_get_ctype_code_range,
eucjp_left_adjust_char_head,
- eucjp_is_allowed_reverse_match,
- onigenc_nothing_get_all_fold_match_code,
- onigenc_nothing_get_fold_match_info
+ eucjp_is_allowed_reverse_match
};