summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-11-04 14:43:08 +0000
committer: ksaito <ksaito@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2004-11-04 14:43:08 +0000
commit: 82cb9eaa3bb49a77df4452cfdff18f817ecf63a6 (patch)
tree: 62fb3445ee466b5710d977707c048a0f26c5781d
parent: 5e853c811ce1d6d6edc187e580a14133667e1058 (diff)
* ascii.c, euc_jp.c, oniggnu.h, oniguruma.h, regcomp.c, regenc.c, regenc.h, regerror.c, regexec.c, reggnu.c, regint.h, regparse.c, regparse.h, sjis.c, utf8.c:
imported Oni Guruma 3.4.0.
* parse.y, re.c: Now mbclen() takes unsigned char as its argument.
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7206 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
-rw-r--r-- ChangeLog 10
-rw-r--r-- oniguruma.h 296
-rw-r--r-- parse.y 46
-rw-r--r-- re.c 31
-rw-r--r-- regcomp.c 816
-rw-r--r-- regexec.c 813
-rw-r--r-- regint.h 116
-rw-r--r-- regparse.c 942
-rw-r--r-- regparse.h 100
9 files changed, 1884 insertions, 1286 deletions
diff --git a/ChangeLog b/ChangeLog
index 70dcf2a8e7..d8cacbab8c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,13 @@
+Thu Nov 4 23:41:55 2004 Kazuo Saito <ksaito@uranus.dti.ne.jp>
+
+ * ascii.c, euc_jp.c, oniggnu.h, oniguruma.h, regcomp.c,
+ regenc.c, regenc.h, regerror.c, regexec.c, reggnu.c,
+ regint.h, regparse.c, regparse.h, sjis.c, utf8.c:
+ imported Oni Guruma 3.4.0.
+
+ * parse.y, re.c: Now mbclen() takes unsigned char as
+ its argument.
+
Thu Nov 4 21:25:38 2004 Yukihiro Matsumoto <matz@ruby-lang.org>
* string.c (str_gsub): string modify check no longer based on
diff --git a/oniguruma.h b/oniguruma.h
index 3fd9f4c395..c10f3b4d18 100644
--- a/oniguruma.h
+++ b/oniguruma.h
@@ -1,17 +1,38 @@
+#ifndef ONIGURUMA_H
+#define ONIGURUMA_H
/**********************************************************************
-
oniguruma.h - Oniguruma (regular expression library)
-
- Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
-#ifndef ONIGURUMA_H
-#define ONIGURUMA_H
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
#define ONIGURUMA
-#define ONIGURUMA_VERSION_MAJOR 2
-#define ONIGURUMA_VERSION_MINOR 2
-#define ONIGURUMA_VERSION_TEENY 8
+#define ONIGURUMA_VERSION_MAJOR 3
+#define ONIGURUMA_VERSION_MINOR 4
+#define ONIGURUMA_VERSION_TEENY 0
#ifndef P_
#if defined(__STDC__) || defined(_WIN32)
@@ -56,12 +77,56 @@ typedef struct {
OnigCodePoint to;
} OnigCodePointRange;
-#define ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE 16
+
+/* ambiguous match flag */
+#define ONIGENC_AMBIGUOUS_MATCH_NONE 0
+#define ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE (1<<0)
+#define ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE (1<<1)
+/* #define ONIGENC_AMBIGUOUS_MATCH_ACCENT (1<<2) */
+/* #define ONIGENC_AMBIGUOUS_MATCH_HIRAGANA_KATAKANA (1<<3) */
+/* #define ONIGENC_AMBIGUOUS_MATCH_KATAKANA_WIDTH (1<<4) */
+
+#define ONIGENC_AMBIGUOUS_MATCH_LIMIT (1<<1)
+#define ONIGENC_AMBIGUOUS_MATCH_COMPOUND (1<<30)
+
+#define ONIGENC_AMBIGUOUS_MATCH_FULL \
+ ( ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \
+ ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \
+ ONIGENC_AMBIGUOUS_MATCH_COMPOUND )
+#define ONIGENC_AMBIGUOUS_MATCH_DEFAULT \
+ (ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE | \
+ ONIGENC_AMBIGUOUS_MATCH_NONASCII_CASE | \
+ ONIGENC_AMBIGUOUS_MATCH_COMPOUND )
+
+typedef unsigned int OnigAmbigType;
+
+#define ONIGENC_MAX_COMP_AMBIG_CODE_LEN 3
+#define ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM 4
+
+typedef struct {
+ int len;
+ OnigCodePoint code[ONIGENC_MAX_COMP_AMBIG_CODE_LEN];
+} OnigCompAmbigCodeItem;
+
typedef struct {
- int target_num;
- int target_byte_len[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE];
- UChar* target_str[ONIGENC_FOLD_MATCH_MAX_TARGET_NUM_SIZE];
-} OnigEncFoldMatchInfo;
+ int n;
+ OnigCodePoint code;
+ OnigCompAmbigCodeItem items[ONIGENC_MAX_COMP_AMBIG_CODE_ITEM_NUM];
+} OnigCompAmbigCodes;
+
+typedef struct {
+ OnigCodePoint from;
+ OnigCodePoint to;
+} OnigPairAmbigCodes;
+
+typedef struct {
+ OnigCodePoint esc;
+ OnigCodePoint anychar;
+ OnigCodePoint anytime;
+ OnigCodePoint zero_or_one_time;
+ OnigCodePoint one_or_more_time;
+ OnigCodePoint anychar_anytime;
+} OnigMetaCharTableType;
#if defined(RUBY_PLATFORM) && defined(M17N_H)
@@ -72,23 +137,24 @@ typedef m17n_encoding* OnigEncoding;
#else
typedef struct {
- const char len_table[256];
- const char* name;
- int max_enc_len;
- int is_fold_match;
- int ctype_support_level; /* sb-only/full */
- int is_continuous_sb_mb; /* code point is continuous from sb to mb */
+ int (*mbc_enc_len)(UChar* p);
+ const char* name;
+ int max_enc_len;
+ int min_enc_len;
+ OnigAmbigType support_ambig_flag;
+ OnigMetaCharTableType meta_char_table;
+ int (*is_mbc_newline)(UChar* p, UChar* end);
OnigCodePoint (*mbc_to_code)(UChar* p, UChar* end);
int (*code_to_mbclen)(OnigCodePoint code);
int (*code_to_mbc)(OnigCodePoint code, UChar *buf);
- int (*mbc_to_lower)(UChar* p, UChar* lower);
- int (*mbc_is_case_ambig)(UChar* p);
- int (*code_is_ctype)(OnigCodePoint code, unsigned int ctype);
+ int (*mbc_to_normalize)(OnigAmbigType flag, UChar** pp, UChar* end, UChar* to);
+ int (*is_mbc_ambiguous)(OnigAmbigType flag, UChar** pp, UChar* end);
+ int (*get_all_pair_ambig_codes)(OnigAmbigType flag, OnigPairAmbigCodes** acs);
+ int (*get_all_comp_ambig_codes)(OnigAmbigType flag, OnigCompAmbigCodes** acs);
+ int (*is_code_ctype)(OnigCodePoint code, unsigned int ctype);
int (*get_ctype_code_range)(int ctype, int* nsb, int* nmb, OnigCodePointRange* sbr[], OnigCodePointRange* mbr[]);
- UChar* (*left_adjust_char_head)(UChar* start, UChar* s);
- int (*is_allowed_reverse_match)(UChar* p, UChar* e);
- int (*get_all_fold_match_code)(OnigCodePoint** codes);
- int (*get_fold_match_info)(UChar* p, UChar* end, OnigEncFoldMatchInfo** info);
+ UChar* (*left_adjust_char_head)(UChar* start, UChar* p);
+ int (*is_allowed_reverse_match)(UChar* p, UChar* end);
} OnigEncodingType;
typedef OnigEncodingType* OnigEncoding;
@@ -110,6 +176,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_14;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_15;
ONIG_EXTERN OnigEncodingType OnigEncodingISO_8859_16;
ONIG_EXTERN OnigEncodingType OnigEncodingUTF8;
+ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_BE;
+ONIG_EXTERN OnigEncodingType OnigEncodingUTF16_LE;
+ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_BE;
+ONIG_EXTERN OnigEncodingType OnigEncodingUTF32_LE;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_JP;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_TW;
ONIG_EXTERN OnigEncodingType OnigEncodingEUC_KR;
@@ -136,6 +206,10 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5;
#define ONIG_ENCODING_ISO_8859_15 (&OnigEncodingISO_8859_15)
#define ONIG_ENCODING_ISO_8859_16 (&OnigEncodingISO_8859_16)
#define ONIG_ENCODING_UTF8 (&OnigEncodingUTF8)
+#define ONIG_ENCODING_UTF16_BE (&OnigEncodingUTF16_BE)
+#define ONIG_ENCODING_UTF16_LE (&OnigEncodingUTF16_LE)
+#define ONIG_ENCODING_UTF32_BE (&OnigEncodingUTF32_BE)
+#define ONIG_ENCODING_UTF32_LE (&OnigEncodingUTF32_LE)
#define ONIG_ENCODING_EUC_JP (&OnigEncodingEUC_JP)
#define ONIG_ENCODING_EUC_TW (&OnigEncodingEUC_TW)
#define ONIG_ENCODING_EUC_KR (&OnigEncodingEUC_KR)
@@ -151,35 +225,32 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5;
/* work size */
-#define ONIGENC_CODE_TO_MBC_MAXLEN 7
-#define ONIGENC_MBC_TO_LOWER_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN
+#define ONIGENC_CODE_TO_MBC_MAXLEN 7
+#define ONIGENC_MBC_NORMALIZE_MAXLEN ONIGENC_CODE_TO_MBC_MAXLEN
/* character types */
-#define ONIGENC_CTYPE_ALPHA (1<< 0)
-#define ONIGENC_CTYPE_BLANK (1<< 1)
-#define ONIGENC_CTYPE_CNTRL (1<< 2)
-#define ONIGENC_CTYPE_DIGIT (1<< 3)
-#define ONIGENC_CTYPE_GRAPH (1<< 4)
-#define ONIGENC_CTYPE_LOWER (1<< 5)
-#define ONIGENC_CTYPE_PRINT (1<< 6)
-#define ONIGENC_CTYPE_PUNCT (1<< 7)
-#define ONIGENC_CTYPE_SPACE (1<< 8)
-#define ONIGENC_CTYPE_UPPER (1<< 9)
-#define ONIGENC_CTYPE_XDIGIT (1<<10)
-#define ONIGENC_CTYPE_WORD (1<<11)
-#define ONIGENC_CTYPE_ASCII (1<<12)
+#define ONIGENC_CTYPE_NEWLINE (1<< 0)
+#define ONIGENC_CTYPE_ALPHA (1<< 1)
+#define ONIGENC_CTYPE_BLANK (1<< 2)
+#define ONIGENC_CTYPE_CNTRL (1<< 3)
+#define ONIGENC_CTYPE_DIGIT (1<< 4)
+#define ONIGENC_CTYPE_GRAPH (1<< 5)
+#define ONIGENC_CTYPE_LOWER (1<< 6)
+#define ONIGENC_CTYPE_PRINT (1<< 7)
+#define ONIGENC_CTYPE_PUNCT (1<< 8)
+#define ONIGENC_CTYPE_SPACE (1<< 9)
+#define ONIGENC_CTYPE_UPPER (1<<10)
+#define ONIGENC_CTYPE_XDIGIT (1<<11)
+#define ONIGENC_CTYPE_WORD (1<<12)
+#define ONIGENC_CTYPE_ASCII (1<<13)
#define ONIGENC_CTYPE_ALNUM (ONIGENC_CTYPE_ALPHA | ONIGENC_CTYPE_DIGIT)
-/* ctype support level */
-#define ONIGENC_CTYPE_SUPPORT_LEVEL_SB 0
-#define ONIGENC_CTYPE_SUPPORT_LEVEL_FULL 1
-
-#define enc_len(enc,byte) ONIGENC_MBC_LEN_BY_HEAD(enc,byte)
+#define enc_len(enc,p) ONIGENC_MBC_ENC_LEN(enc,p)
#define ONIGENC_IS_UNDEF(enc) ((enc) == ONIG_ENCODING_UNDEF)
#define ONIGENC_IS_SINGLEBYTE(enc) (ONIGENC_MBC_MAXLEN(enc) == 1)
-#define ONIGENC_IS_MBC_HEAD(enc,byte) (ONIGENC_MBC_LEN_BY_HEAD(enc,byte) != 1)
+#define ONIGENC_IS_MBC_HEAD(enc,p) (ONIGENC_MBC_ENC_LEN(enc,p) != 1)
#define ONIGENC_IS_MBC_ASCII(p) (*(p) < 128)
#define ONIGENC_IS_CODE_ASCII(code) ((code) < 128)
#define ONIGENC_IS_CODE_SB_WORD(enc,code) \
@@ -192,31 +263,33 @@ ONIG_EXTERN OnigEncodingType OnigEncodingBIG5;
#include <ctype.h> /* for isblank(), isgraph() */
-#define ONIGENC_MBC_TO_LOWER(enc,p,buf) onigenc_mbc_to_lower(enc,p,buf)
-#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) onigenc_mbc_is_case_ambig(enc,p)
+#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \
+ onigenc_mbc_to_normalize(enc,flag,pp,end,buf)
+#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \
+ onigenc_is_mbc_ambiguous(enc,flag,pp,end)
-#define ONIGENC_IS_FOLD_MATCH(enc) FALSE
-#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) FALSE
-#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ONIGENC_CTYPE_SUPPORT_LEVEL_SB
+#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ONIGENC_AMBIGUOUS_MATCH_ASCII_CASE
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \
onigenc_is_allowed_reverse_match(enc, s, end)
#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \
onigenc_get_left_adjust_char_head(enc, start, s)
-#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) 0
-#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) ONIG_NO_SUPPORT_CONFIG
+#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, ambig_flag, acs) 0
+#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, ambig_flag, acs) 0
#define ONIGENC_GET_CTYPE_CODE_RANGE(enc,ctype,nsb,nmb,sbr,mbr) \
ONIG_NO_SUPPORT_CONFIG
-#define ONIGENC_MBC_LEN_BY_HEAD(enc,b) m17n_mbclen(enc,(int )b)
+#define ONIGENC_MBC_ENC_LEN(enc,p) m17n_mbclen(enc,(int )(*(p))) /* p is parenthesized so an argument like s+1 dereferences the intended byte */
#define ONIGENC_MBC_MAXLEN(enc) m17n_mbmaxlen(enc)
#define ONIGENC_MBC_MAXLEN_DIST(enc) \
(ONIGENC_MBC_MAXLEN(enc) > 0 ? ONIGENC_MBC_MAXLEN(enc) \
: ONIG_INFINITE_DISTANCE)
+#define ONIGENC_MBC_MINLEN(enc) 1
#define ONIGENC_MBC_TO_CODE(enc,p,e) m17n_codepoint((enc),(p),(e))
#define ONIGENC_CODE_TO_MBCLEN(enc,code) m17n_codelen((enc),(code))
#define ONIGENC_CODE_TO_MBC(enc,code,buf) onigenc_code_to_mbc(enc, code, buf)
-#if 0
-#define ONIGENC_STEP_BACK(enc,start,s,n) /* !! not supported !! */
+#if 0 /* !! not supported !! */
+#define ONIGENC_IS_MBC_NEWLINE(enc,p,end)
+#define ONIGENC_STEP_BACK(enc,start,s,n)
#endif
#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) \
@@ -251,9 +324,9 @@ int onigenc_is_code_ctype P_((OnigEncoding enc, OnigCodePoint code, int ctype));
ONIG_EXTERN
int onigenc_code_to_mbc P_((OnigEncoding enc, OnigCodePoint code, UChar *buf));
ONIG_EXTERN
-int onigenc_mbc_to_lower P_((OnigEncoding enc, UChar* p, UChar* buf));
+int onigenc_mbc_to_normalize P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end, UChar* buf));
ONIG_EXTERN
-int onigenc_mbc_is_case_ambig P_((OnigEncoding enc, UChar* p));
+int onigenc_is_mbc_ambiguous P_((OnigEncoding enc, OnigAmbigType flag, UChar** pp, UChar* end));
ONIG_EXTERN
int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end));
@@ -261,32 +334,35 @@ int onigenc_is_allowed_reverse_match P_((OnigEncoding enc, UChar* s, UChar* end)
#define ONIGENC_NAME(enc) ((enc)->name)
-#define ONIGENC_MBC_TO_LOWER(enc,p,buf) (enc)->mbc_to_lower(p,buf)
-#define ONIGENC_IS_MBC_CASE_AMBIG(enc,p) (enc)->mbc_is_case_ambig(p)
-
-#define ONIGENC_IS_FOLD_MATCH(enc) ((enc)->is_fold_match)
-#define ONIGENC_IS_CONTINUOUS_SB_MB(enc) ((enc)->is_continuous_sb_mb)
-#define ONIGENC_CTYPE_SUPPORT_LEVEL(enc) ((enc)->ctype_support_level)
+#define ONIGENC_MBC_TO_NORMALIZE(enc,flag,pp,end,buf) \
+ (enc)->mbc_to_normalize(flag,pp,end,buf)
+#define ONIGENC_IS_MBC_AMBIGUOUS(enc,flag,pp,end) \
+ (enc)->is_mbc_ambiguous(flag,pp,end)
+#define ONIGENC_SUPPORT_AMBIG_FLAG(enc) ((enc)->support_ambig_flag)
#define ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc,s,end) \
(enc)->is_allowed_reverse_match(s,end)
#define ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc,start,s) \
(enc)->left_adjust_char_head(start, s)
-#define ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc,codes) \
- (enc)->get_all_fold_match_code(codes)
-#define ONIGENC_GET_FOLD_MATCH_INFO(enc,p,end,info) \
- (enc)->get_fold_match_info(p,end,info)
+#define ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc,ambig_flag,acs) \
+ (enc)->get_all_pair_ambig_codes(ambig_flag,acs)
+#define ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc,ambig_flag,acs) \
+ (enc)->get_all_comp_ambig_codes(ambig_flag,acs)
#define ONIGENC_STEP_BACK(enc,start,s,n) \
onigenc_step_back((enc),(start),(s),(n))
-#define ONIGENC_MBC_LEN_BY_HEAD(enc,byte) ((enc)->len_table[(int )(byte)])
+#define ONIGENC_MBC_ENC_LEN(enc,p) (enc)->mbc_enc_len(p)
#define ONIGENC_MBC_MAXLEN(enc) ((enc)->max_enc_len)
#define ONIGENC_MBC_MAXLEN_DIST(enc) ONIGENC_MBC_MAXLEN(enc)
-#define ONIGENC_MBC_TO_CODE(enc,p,e) (enc)->mbc_to_code((p),(e))
+#define ONIGENC_MBC_MINLEN(enc) ((enc)->min_enc_len)
+#define ONIGENC_IS_MBC_NEWLINE(enc,p,end) (enc)->is_mbc_newline((p),(end))
+#define ONIGENC_MBC_TO_CODE(enc,p,end) (enc)->mbc_to_code((p),(end))
#define ONIGENC_CODE_TO_MBCLEN(enc,code) (enc)->code_to_mbclen(code)
#define ONIGENC_CODE_TO_MBC(enc,code,buf) (enc)->code_to_mbc(code,buf)
-#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->code_is_ctype(code,ctype)
+#define ONIGENC_IS_CODE_CTYPE(enc,code,ctype) (enc)->is_code_ctype(code,ctype)
+#define ONIGENC_IS_CODE_NEWLINE(enc,code) \
+ ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_NEWLINE)
#define ONIGENC_IS_CODE_GRAPH(enc,code) \
ONIGENC_IS_CODE_CTYPE(enc,code,ONIGENC_CTYPE_GRAPH)
#define ONIGENC_IS_CODE_PRINT(enc,code) \
@@ -340,6 +416,12 @@ ONIG_EXTERN
UChar* onigenc_get_left_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s));
ONIG_EXTERN
UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UChar* s));
+ONIG_EXTERN
+int onigenc_strlen P_((OnigEncoding enc, UChar* p, UChar* end));
+ONIG_EXTERN
+int onigenc_strlen_null P_((OnigEncoding enc, UChar* p));
+ONIG_EXTERN
+int onigenc_str_bytelen_null P_((OnigEncoding enc, UChar* p));
@@ -353,13 +435,6 @@ UChar* onigenc_get_right_adjust_char_head P_((OnigEncoding enc, UChar* start, UC
/* constants */
#define ONIG_MAX_ERROR_MESSAGE_LEN 90
-#if defined(RUBY_PLATFORM) && !defined(ONIG_RUBY_M17N)
-ONIG_EXTERN OnigEncoding OnigEncDefaultCharEncoding;
-#undef ismbchar
-#define ismbchar(c) (mbclen((c)) != 1)
-#define mbclen(c) (OnigEncDefaultCharEncoding->len_table[(unsigned char )(c)])
-#endif
-
typedef unsigned int OnigOptionType;
#define ONIG_OPTION_DEFAULT ONIG_OPTION_NONE
@@ -467,6 +542,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY (1<<16) /* \p{...}, \P{...} */
#define ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT (1<<17) /* \p{^..}, \P{^..} */
#define ONIG_SYN_OP2_CHAR_PROPERTY_PREFIX_IS (1<<18) /* \p{IsXDigit} */
+#define ONIG_SYN_OP2_ESC_H_XDIGIT (1<<19) /* \h, \H */
/* syntax (behavior) */
#define ONIG_SYN_CONTEXT_INDEP_ANCHORS (1<<31) /* not implemented */
@@ -479,6 +555,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND (1<<6) /* (?<=a|bc) */
#define ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP (1<<7) /* see doc/RE */
#define ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME (1<<8) /* (?<x>)(?<x>) */
+#define ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY (1<<9) /* a{n}?=(?:a{n})? */
/* syntax (behavior) in char class [...] */
#define ONIG_SYN_NOT_NEWLINE_IN_NEGATIVE_CC (1<<20) /* [^...] */
@@ -565,6 +642,7 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIGERR_INVALID_CHAR_PROPERTY_NAME -223
#define ONIGERR_INVALID_WIDE_CHAR_VALUE -400
#define ONIGERR_TOO_BIG_WIDE_CHAR_VALUE -401
+#define ONIGERR_NOT_SUPPORTED_ENCODING_COMBINATION -402
/* errors related to thread */
#define ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT -1001
@@ -575,6 +653,15 @@ ONIG_EXTERN OnigSyntaxType* OnigDefaultSyntax;
#define ONIG_IS_CAPTURE_HISTORY_GROUP(r, i) \
((i) <= ONIG_MAX_CAPTURE_HISTORY_GROUP && (r)->list && (r)->list[i])
+typedef struct OnigCaptureTreeNodeStruct {
+ int group; /* group number */
+ int beg;
+ int end;
+ int allocated;
+ int num_childs;
+ struct OnigCaptureTreeNodeStruct** childs;
+} OnigCaptureTreeNode;
+
/* match result region type */
struct re_registers {
int allocated;
@@ -582,9 +669,16 @@ struct re_registers {
int* beg;
int* end;
/* extended */
- struct re_registers** list; /* capture history. list[1]-list[31] */
+ OnigCaptureTreeNode* history_root; /* capture history tree root */
};
+/* capture tree traverse */
+#define ONIG_TRAVERSE_CALLBACK_AT_FIRST 1
+#define ONIG_TRAVERSE_CALLBACK_AT_LAST 2
+#define ONIG_TRAVERSE_CALLBACK_AT_BOTH \
+ ( ONIG_TRAVERSE_CALLBACK_AT_FIRST | ONIG_TRAVERSE_CALLBACK_AT_LAST )
+
+
#define ONIG_REGION_NOTPOS -1
typedef struct re_registers OnigRegion;
@@ -635,6 +729,7 @@ typedef struct re_pattern_buffer {
OnigEncoding enc;
OnigOptionType options;
OnigSyntaxType* syntax;
+ OnigAmbigType ambig_flag;
void* name_table;
/* optimization info (string search, char-map and anchors) */
@@ -646,7 +741,7 @@ typedef struct re_pattern_buffer {
int sub_anchor; /* start-anchor for exact or map */
unsigned char *exact;
unsigned char *exact_end;
- unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */
+ unsigned char map[ONIG_CHAR_TABLE_SIZE]; /* used as BM skip or char-map */
int *int_map; /* BM skip for exact_len > 255 */
int *int_map_backward; /* BM skip for backward search */
OnigDistance dmin; /* min-distance of exact or map */
@@ -657,6 +752,15 @@ typedef struct re_pattern_buffer {
} regex_t;
+typedef struct {
+ int num_of_elements;
+ OnigEncoding pattern_enc;
+ OnigEncoding target_enc;
+ OnigSyntaxType* syntax;
+ OnigOptionType option;
+ OnigAmbigType ambig_flag;
+} OnigCompileInfo;
+
/* Oniguruma Native API */
ONIG_EXTERN
int onig_init P_((void));
@@ -669,10 +773,14 @@ void onig_set_verb_warn_func P_((OnigWarnFunc f));
ONIG_EXTERN
int onig_new P_((regex_t**, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
ONIG_EXTERN
+int onig_new_deluxe P_((regex_t** reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo));
+ONIG_EXTERN
void onig_free P_((regex_t*));
ONIG_EXTERN
int onig_recompile P_((regex_t*, UChar* pattern, UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo));
ONIG_EXTERN
+int onig_recompile_deluxe P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigCompileInfo* ci, OnigErrorInfo* einfo));
+ONIG_EXTERN
int onig_search P_((regex_t*, UChar* str, UChar* end, UChar* start, UChar* range, OnigRegion* region, OnigOptionType option));
ONIG_EXTERN
int onig_match P_((regex_t*, UChar* str, UChar* end, UChar* at, OnigRegion* region, OnigOptionType option));
@@ -696,16 +804,34 @@ int onig_foreach_name P_((regex_t* reg, int (*func)(UChar*,UChar*,int,int*,regex
ONIG_EXTERN
int onig_number_of_names P_((regex_t* reg));
ONIG_EXTERN
+int onig_number_of_captures P_((regex_t* reg));
+ONIG_EXTERN
+int onig_number_of_capture_histories P_((regex_t* reg));
+ONIG_EXTERN
+OnigCaptureTreeNode* onig_get_capture_tree P_((OnigRegion* region));
+ONIG_EXTERN
+int onig_capture_tree_traverse P_((OnigRegion* region, int at, int(*callback_func)(int,int,int,int,int,void*), void* arg));
+ONIG_EXTERN
OnigEncoding onig_get_encoding P_((regex_t* reg));
ONIG_EXTERN
OnigOptionType onig_get_options P_((regex_t* reg));
ONIG_EXTERN
+OnigAmbigType onig_get_ambig_flag P_((regex_t* reg));
+ONIG_EXTERN
OnigSyntaxType* onig_get_syntax P_((regex_t* reg));
ONIG_EXTERN
int onig_set_default_syntax P_((OnigSyntaxType* syntax));
ONIG_EXTERN
void onig_copy_syntax P_((OnigSyntaxType* to, OnigSyntaxType* from));
ONIG_EXTERN
+unsigned int onig_get_syntax_op P_((OnigSyntaxType* syntax));
+ONIG_EXTERN
+unsigned int onig_get_syntax_op2 P_((OnigSyntaxType* syntax));
+ONIG_EXTERN
+unsigned int onig_get_syntax_behavior P_((OnigSyntaxType* syntax));
+ONIG_EXTERN
+OnigOptionType onig_get_syntax_options P_((OnigSyntaxType* syntax));
+ONIG_EXTERN
void onig_set_syntax_op P_((OnigSyntaxType* syntax, unsigned int op));
ONIG_EXTERN
void onig_set_syntax_op2 P_((OnigSyntaxType* syntax, unsigned int op2));
@@ -714,7 +840,9 @@ void onig_set_syntax_behavior P_((OnigSyntaxType* syntax, unsigned int behavior)
ONIG_EXTERN
void onig_set_syntax_options P_((OnigSyntaxType* syntax, OnigOptionType options));
ONIG_EXTERN
-int onig_set_meta_char P_((unsigned int what, OnigCodePoint code));
+int onig_set_meta_char P_((OnigEncoding enc, unsigned int what, OnigCodePoint code));
+ONIG_EXTERN
+void onig_copy_encoding P_((OnigEncoding to, OnigEncoding from));
ONIG_EXTERN
unsigned int onig_get_match_stack_limit_size P_((void));
ONIG_EXTERN
@@ -723,5 +851,7 @@ ONIG_EXTERN
int onig_end P_((void));
ONIG_EXTERN
const char* onig_version P_((void));
+ONIG_EXTERN
+const char* onig_copyright P_((void));
#endif /* ONIGURUMA_H */
diff --git a/parse.y b/parse.y
index 8e80142d39..051bee456f 100644
--- a/parse.y
+++ b/parse.y
@@ -4853,8 +4853,10 @@ parser_tokadd_string(parser, func, term, paren, nest)
long *nest;
{
int c;
+ unsigned char uc;
while ((c = nextc()) != -1) {
+ uc = (unsigned char)c;
if (paren && c == paren) {
++*nest;
}
@@ -4905,8 +4907,8 @@ parser_tokadd_string(parser, func, term, paren, nest)
}
}
}
- else if (ismbchar(c)) {
- int i, len = mbclen(c)-1;
+ else if (ismbchar(uc)) {
+ int i, len = mbclen(uc)-1;
for (i = 0; i < len; i++) {
tokadd(c);
@@ -5002,6 +5004,7 @@ parser_heredoc_identifier(parser)
struct parser_params *parser;
{
int c = nextc(), term, func = 0, len;
+ unsigned char uc; /* byte form of c for mbclen()/is_identchar(), which take unsigned char */
if (c == '-') {
c = nextc();
@@ -5019,7 +5022,8 @@ parser_heredoc_identifier(parser)
tokadd(func);
term = c;
while ((c = nextc()) != -1 && c != term) {
- len = mbclen(c);
+ uc = (unsigned char)c;
+ len = mbclen(uc);
do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1);
}
if (c == -1) {
@@ -5029,7 +5033,8 @@ parser_heredoc_identifier(parser)
break;
default:
- if (!is_identchar(c)) {
+ uc = (unsigned char)c; /* narrows c to a byte value even if it is -1 here */
+ if (!is_identchar(uc)) {
pushback(c);
if (func & STR_FUNC_INDENT) {
pushback('-');
@@ -5040,9 +5045,11 @@ parser_heredoc_identifier(parser)
term = '"';
tokadd(func |= str_dquote);
do {
- len = mbclen(c);
+ uc = (unsigned char)c;
+ len = mbclen(uc);
do {tokadd(c);} while (--len > 0 && (c = nextc()) != -1);
- } while ((c = nextc()) != -1 && is_identchar(c));
+ } while ((c = nextc()) != -1 &&
+ (uc = (unsigned char)c, is_identchar(uc)));
pushback(c);
break;
}
@@ -5233,6 +5240,7 @@ parser_yylex(parser)
register int c;
int space_seen = 0;
int cmd_state;
+ unsigned char uc;
#ifdef RIPPER
int fallthru = Qfalse;
#endif
@@ -5519,6 +5527,7 @@ parser_yylex(parser)
rb_compile_error(PARSER_ARG "incomplete character syntax");
return 0;
}
+ uc = (unsigned char)c;
if (ISSPACE(c)){
if (!IS_ARG()){
int c2 = 0;
@@ -5551,7 +5560,7 @@ parser_yylex(parser)
lex_state = EXPR_TERNARY;
return '?';
}
- else if (ismbchar(c)) {
+ else if (ismbchar(uc)) {
rb_warnI("multibyte character literal not supported yet; use ?\\%.3o", c);
goto ternary;
}
@@ -6098,7 +6107,8 @@ parser_yylex(parser)
}
else {
term = nextc();
- if (ISALNUM(term) || ismbchar(term)) {
+ uc = (unsigned char)term; /* test the terminator just read, as the removed line did (was mistakenly taken from c) */
+ if (ISALNUM(term) || ismbchar(uc)) {
yyerror("unknown type of %string");
return 0;
}
@@ -6177,7 +6187,8 @@ parser_yylex(parser)
switch (c) {
case '_': /* $_: last read line string */
c = nextc();
- if (is_identchar(c)) {
+ uc = (unsigned char)c;
+ if (is_identchar(uc)) {
tokadd('$');
tokadd('_');
break;
@@ -6243,7 +6254,8 @@ parser_yylex(parser)
return tNTH_REF;
default:
- if (!is_identchar(c)) {
+ uc = (unsigned char)c;
+ if (!is_identchar(uc)) {
pushback(c);
return '$';
}
@@ -6268,7 +6280,8 @@ parser_yylex(parser)
rb_compile_error(PARSER_ARG "`@@%c' is not allowed as a class variable name", c);
}
}
- if (!is_identchar(c)) {
+ uc = (unsigned char)c;
+ if (!is_identchar(uc)) {
pushback(c);
return '@';
}
@@ -6290,7 +6303,8 @@ parser_yylex(parser)
break;
default:
- if (!is_identchar(c)) {
+ uc = (unsigned char)c;
+ if (!is_identchar(uc)) {
rb_compile_error(PARSER_ARG "Invalid char `\\%03o' in expression", c);
goto retry;
}
@@ -6299,10 +6313,11 @@ parser_yylex(parser)
break;
}
+ uc = (unsigned char)c;
do {
tokadd(c);
- if (ismbchar(c)) {
- int i, len = mbclen(c)-1;
+ if (ismbchar(uc)) {
+ int i, len = mbclen(uc)-1;
for (i = 0; i < len; i++) {
c = nextc();
@@ -6310,7 +6325,8 @@ parser_yylex(parser)
}
}
c = nextc();
- } while (is_identchar(c));
+ uc = (unsigned char)c;
+ } while (is_identchar(uc));
if ((c == '!' || c == '?') && is_identchar(tok()[0]) && !peek('=')) {
tokadd(c);
}
diff --git a/re.c b/re.c
index daa179d819..59b80a9b04 100644
--- a/re.c
+++ b/re.c
@@ -248,11 +248,12 @@ rb_reg_mbclen2(c, re)
VALUE re;
{
int len;
+ unsigned char uc = (unsigned char)c;
if (!FL_TEST(re, KCODE_FIXED))
- return mbclen(c);
+ return mbclen(uc);
kcode_set_option(re);
- len = mbclen(c);
+ len = mbclen(uc);
kcode_reset_option();
return len;
}
@@ -1775,8 +1776,8 @@ rb_reg_quote(str)
send = s + RSTRING(str)->len;
for (; s < send; s++) {
c = *s;
- if (ismbchar(c)) {
- int n = mbclen(c);
+ if (ismbchar(*s)) {
+ int n = mbclen(*s);
while (n-- && s < send)
s++;
@@ -1804,8 +1805,8 @@ rb_reg_quote(str)
for (; s < send; s++) {
c = *s;
- if (ismbchar(c)) {
- int n = mbclen(c);
+ if (ismbchar(*s)) {
+ int n = mbclen(*s);
while (n-- && s < send)
*t++ = *s++;
@@ -2044,21 +2045,23 @@ rb_reg_regsub(str, src, regs)
struct re_registers *regs;
{
VALUE val = 0;
- char *p, *s, *e, c;
+ char *p, *s, *e;
+ unsigned char uc;
int no;
+
p = s = RSTRING(str)->ptr;
e = s + RSTRING(str)->len;
while (s < e) {
char *ss = s;
- c = *s++;
- if (ismbchar(c)) {
- s += mbclen(c) - 1;
+ uc = (unsigned char)*s++;
+ if (ismbchar(uc)) {
+ s += mbclen(uc) - 1;
continue;
}
- if (c != '\\' || s == e) continue;
+ if (uc != '\\' || s == e) continue;
if (!val) {
val = rb_str_buf_new(ss-p);
@@ -2068,12 +2071,12 @@ rb_reg_regsub(str, src, regs)
rb_str_buf_cat(val, p, ss-p);
}
- c = *s++;
+ uc = (unsigned char)*s++;
p = s;
- switch (c) {
+ switch (uc) {
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9':
- no = c - '0';
+ no = uc - '0';
break;
case '&':
no = 0;
diff --git a/regcomp.c b/regcomp.c
index 3b62e18b43..3db7b3ad6a 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -1,16 +1,42 @@
/**********************************************************************
-
regcomp.c - Oniguruma (regular expression library)
-
- Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
#include "regparse.h"
#ifndef PLATFORM_UNALIGNED_WORD_ACCESS
static unsigned char PadBuf[WORD_ALIGNMENT_SIZE];
#endif
+/*
+ Caution: node should not be a string node.
+ (s and end member address break)
+*/
static void
swap_node(Node* a, Node* b)
{
@@ -120,33 +146,6 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node)
#endif /* USE_SUBEXP_CALL */
-#if 0
-static int
-bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc)
-{
- int i;
- int len, maxlen = 0;
-
- if (negative) {
- for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
- if (! BITSET_AT(bs, i)) {
- len = enc_len(enc, i);
- if (len > maxlen) maxlen = len;
- }
- }
- }
- else {
- for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
- if (BITSET_AT(bs, i)) {
- len = enc_len(enc, i);
- if (len > maxlen) maxlen = len;
- }
- }
- }
- return maxlen;
-}
-#endif
-
static int
add_opcode(regex_t* reg, int opcode)
{
@@ -293,15 +292,15 @@ select_str_opcode(int mb_len, int str_len, int ignore_case)
{
int op;
- switch (mb_len) {
- case 1:
- if (ignore_case) {
- switch (str_len) {
- case 1: op = OP_EXACT1_IC; break;
- default: op = OP_EXACTN_IC; break;
- }
+ if (ignore_case) {
+ switch (str_len) {
+ case 1: op = OP_EXACT1_IC; break;
+ default: op = OP_EXACTN_IC; break;
}
- else {
+ }
+ else {
+ switch (mb_len) {
+ case 1:
switch (str_len) {
case 1: op = OP_EXACT1; break;
case 2: op = OP_EXACT2; break;
@@ -310,25 +309,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case)
case 5: op = OP_EXACT5; break;
default: op = OP_EXACTN; break;
}
- }
- break;
+ break;
- case 2:
- switch (str_len) {
- case 1: op = OP_EXACTMB2N1; break;
- case 2: op = OP_EXACTMB2N2; break;
- case 3: op = OP_EXACTMB2N3; break;
- default: op = OP_EXACTMB2N; break;
- }
- break;
+ case 2:
+ switch (str_len) {
+ case 1: op = OP_EXACTMB2N1; break;
+ case 2: op = OP_EXACTMB2N2; break;
+ case 3: op = OP_EXACTMB2N3; break;
+ default: op = OP_EXACTMB2N; break;
+ }
+ break;
- case 3:
- op = OP_EXACTMB3N;
- break;
+ case 3:
+ op = OP_EXACTMB3N;
+ break;
- default:
- op = OP_EXACTMBN;
- break;
+ default:
+ op = OP_EXACTMBN;
+ break;
+ }
}
return op;
}
@@ -373,7 +372,7 @@ compile_call(CallNode* node, regex_t* reg)
r = add_opcode(reg, OP_CALL);
if (r) return r;
r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg),
- node->target);
+ node->target);
if (r) return r;
r = add_abs_addr(reg, 0 /*dummy addr.*/);
return r;
@@ -394,15 +393,14 @@ compile_tree_n_times(Node* node, int n, regex_t* reg)
static int
add_compile_string_length(UChar* s, int mb_len, int str_len,
- regex_t* reg, int ignore_case)
+ regex_t* reg, int ignore_case)
{
int len;
int op = select_str_opcode(mb_len, str_len, ignore_case);
len = SIZE_OPCODE;
- if (op == OP_EXACTMBN)
- len += SIZE_LENGTH;
+ if (op == OP_EXACTMBN) len += SIZE_LENGTH;
if (IS_NEED_STR_LEN_OP_EXACT(op))
len += SIZE_LENGTH;
@@ -412,7 +410,7 @@ add_compile_string_length(UChar* s, int mb_len, int str_len,
static int
add_compile_string(UChar* s, int mb_len, int str_len,
- regex_t* reg, int ignore_case)
+ regex_t* reg, int ignore_case)
{
int op = select_str_opcode(mb_len, str_len, ignore_case);
add_opcode(reg, op);
@@ -420,8 +418,12 @@ add_compile_string(UChar* s, int mb_len, int str_len,
if (op == OP_EXACTMBN)
add_length(reg, mb_len);
- if (IS_NEED_STR_LEN_OP_EXACT(op))
- add_length(reg, str_len);
+ if (IS_NEED_STR_LEN_OP_EXACT(op)) {
+ if (op == OP_EXACTN_IC)
+ add_length(reg, mb_len * str_len);
+ else
+ add_length(reg, str_len);
+ }
add_bytes(reg, s, mb_len * str_len);
return 0;
@@ -429,49 +431,37 @@ add_compile_string(UChar* s, int mb_len, int str_len,
static int
-compile_length_string_node(StrNode* sn, regex_t* reg)
+compile_length_string_node(Node* node, regex_t* reg)
{
- int rlen, r, len, prev_len, slen, ambig, ic;
+ int rlen, r, len, prev_len, slen, ambig;
OnigEncoding enc = reg->enc;
UChar *p, *prev;
+ StrNode* sn;
+ sn = &(NSTRING(node));
if (sn->end <= sn->s)
return 0;
- ic = IS_IGNORECASE(reg->options);
+ ambig = NSTRING_IS_AMBIG(node);
p = prev = sn->s;
- prev_len = enc_len(enc, *p);
- if (ic != 0 && prev_len == 1)
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
- else
- ambig = 0;
-
+ prev_len = enc_len(enc, p);
p += prev_len;
slen = 1;
rlen = 0;
for (; p < sn->end; ) {
- len = enc_len(enc, *p);
+ len = enc_len(enc, p);
if (len == prev_len) {
slen++;
- if (ic != 0 && ambig == 0 && len == 1)
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
}
else {
r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
rlen += r;
-
- if (ic != 0 && len == 1)
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
- else
- ambig = 0;
-
prev = p;
slen = 1;
prev_len = len;
}
-
p += len;
}
r = add_compile_string_length(prev, prev_len, slen, reg, ambig);
@@ -489,49 +479,33 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg)
}
static int
-compile_string_node(StrNode* sn, regex_t* reg)
+compile_string_node(Node* node, regex_t* reg)
{
- int r, len, prev_len, slen, ambig, ic;
+ int r, len, prev_len, slen, ambig;
OnigEncoding enc = reg->enc;
- UChar *p, *prev;
+ UChar *p, *prev, *end;
+ StrNode* sn;
+ sn = &(NSTRING(node));
if (sn->end <= sn->s)
return 0;
- ic = IS_IGNORECASE(reg->options);
+ end = sn->end;
+ ambig = NSTRING_IS_AMBIG(node);
p = prev = sn->s;
- prev_len = enc_len(enc, *p);
- if (ic != 0 && prev_len == 1) {
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
- if (ambig != 0)
- ONIGENC_MBC_TO_LOWER(reg->enc, p, p);
- }
- else
- ambig = 0;
-
+ prev_len = enc_len(enc, p);
p += prev_len;
slen = 1;
- for (; p < sn->end; ) {
- len = enc_len(enc, *p);
+ for (; p < end; ) {
+ len = enc_len(enc, p);
if (len == prev_len) {
slen++;
- if (ic != 0 && len == 1) {
- if (ambig == 0)
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
- if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p);
- }
}
else {
r = add_compile_string(prev, prev_len, slen, reg, ambig);
if (r) return r;
- if (ic != 0 && len == 1) {
- ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p);
- if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p);
- }
- else
- ambig = 0;
prev = p;
slen = 1;
@@ -584,8 +558,7 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg)
len = SIZE_OPCODE + SIZE_BITSET;
}
else {
- if (bitset_is_empty(cc->bs)) {
- /* SIZE_BITSET is included in mbuf->used. */
+ if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
len = SIZE_OPCODE;
}
else {
@@ -613,7 +586,7 @@ compile_cclass_node(CClassNode* cc, regex_t* reg)
r = add_bitset(reg, cc->bs);
}
else {
- if (bitset_is_empty(cc->bs)) {
+ if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) {
if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT);
else add_opcode(reg, OP_CCLASS_MB);
@@ -649,7 +622,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper)
int n;
n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC;
p = (OnigRepeatRange* )xrealloc(reg->repeat_range,
- sizeof(OnigRepeatRange) * n);
+ sizeof(OnigRepeatRange) * n);
CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY);
reg->repeat_range = p;
reg->repeat_range_alloc = n;
@@ -665,7 +638,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper)
static int
compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info,
- regex_t* reg)
+ regex_t* reg)
{
int r;
int num_repeat = reg->num_repeat;
@@ -685,15 +658,16 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info,
if (r) return r;
if (
- #ifdef USE_SUBEXP_CALL
+#ifdef USE_SUBEXP_CALL
reg->num_call > 0 ||
- #endif
+#endif
IS_QUALIFIER_IN_REPEAT(qn)) {
r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG);
}
else {
r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG);
}
+
if (r) return r;
r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
return r;
@@ -715,9 +689,9 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg)
if (NTYPE(qn->target) == N_ANYCHAR) {
if (qn->greedy && infinite) {
if (IS_NOT_NULL(qn->next_head_exact))
- return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
+ return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower;
else
- return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
+ return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower;
}
}
@@ -750,7 +724,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg)
len = SIZE_OP_JUMP + tlen;
}
else if (!infinite && qn->greedy &&
- (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) {
+ (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
+ <= QUALIFIER_EXPAND_LIMIT_SIZE)) {
len = tlen * qn->lower;
len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower);
}
@@ -874,7 +849,8 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg)
r = compile_tree(qn->target, reg);
}
else if (!infinite && qn->greedy &&
- (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) {
+ (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper
+ <= QUALIFIER_EXPAND_LIMIT_SIZE)) {
int n = qn->upper - qn->lower;
r = compile_tree_n_times(qn->target, qn->lower, reg);
@@ -934,18 +910,16 @@ compile_option_node(EffectNode* node, regex_t* reg)
if (r) return r;
r = add_opcode(reg, OP_FAIL);
if (r) return r;
+ }
- reg->options = node->option;
- r = compile_tree(node->target, reg);
- reg->options = prev;
+ reg->options = node->option;
+ r = compile_tree(node->target, reg);
+ reg->options = prev;
+
+ if (IS_DYNAMIC_OPTION(prev ^ node->option)) {
if (r) return r;
r = add_opcode_option(reg, OP_SET_OPTION, prev);
}
- else {
- reg->options = node->option;
- r = compile_tree(node->target, reg);
- reg->options = prev;
- }
return r;
}
@@ -992,7 +966,7 @@ compile_length_effect_node(EffectNode* node, regex_t* reg)
break;
case EFFECT_STOP_BACKTRACK:
- if (IS_EFFECT_SIMPLE_REPEAT(node)) {
+ if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
QualifierNode* qn = &NQUALIFIER(node->target);
tlen = compile_length_tree(qn->target, reg);
if (tlen < 0) return tlen;
@@ -1082,7 +1056,7 @@ compile_effect_node(EffectNode* node, regex_t* reg)
break;
case EFFECT_STOP_BACKTRACK:
- if (IS_EFFECT_SIMPLE_REPEAT(node)) {
+ if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) {
QualifierNode* qn = &NQUALIFIER(node->target);
r = compile_tree_n_times(qn->target, qn->lower, reg);
if (r) return r;
@@ -1267,7 +1241,7 @@ compile_length_tree(Node* node, regex_t* reg)
if (NSTRING_IS_RAW(node))
r = compile_length_string_raw_node(&(NSTRING(node)), reg);
else
- r = compile_length_string_node(&(NSTRING(node)), reg);
+ r = compile_length_string_node(node, reg);
break;
case N_CCLASS:
@@ -1365,7 +1339,7 @@ compile_tree(Node* node, regex_t* reg)
if (NSTRING_IS_RAW(node))
r = compile_string_raw_node(&(NSTRING(node)), reg);
else
- r = compile_string_node(&(NSTRING(node)), reg);
+ r = compile_string_node(node, reg);
break;
case N_CCLASS:
@@ -1421,8 +1395,14 @@ compile_tree(Node* node, regex_t* reg)
}
else {
int* p;
- add_opcode(reg, (IS_IGNORECASE(reg->options) ?
- OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI));
+
+ if (IS_IGNORECASE(reg->options)) {
+ add_opcode(reg, OP_BACKREF_MULTI_IC);
+ }
+ else {
+ add_opcode(reg, OP_BACKREF_MULTI);
+ }
+
if (r) return r;
add_length(reg, br->back_num);
if (r) return r;
@@ -2053,7 +2033,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level)
StrNode* sn = &(NSTRING(node));
UChar *s = sn->s;
while (s < sn->end) {
- s += enc_len(reg->enc, *s);
+ s += enc_len(reg->enc, s);
(*len)++;
}
}
@@ -2144,7 +2124,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc)
{
int found;
- if (code >= SINGLE_BYTE_SIZE) {
+ if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) {
if (IS_NULL(cc->mbuf)) {
found = 0;
}
@@ -2309,7 +2289,7 @@ is_not_included(Node* x, Node* y, regex_t* reg)
CClassNode* cc = &(NCCLASS(y));
code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s,
- xs->s + enc_len(reg->enc, c));
+ xs->s + ONIGENC_MBC_MAXLEN(reg->enc));
return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1);
}
break;
@@ -2320,18 +2300,9 @@ is_not_included(Node* x, Node* y, regex_t* reg)
StrNode* ys = &(NSTRING(y));
len = NSTRING_LEN(x);
if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y);
- if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) {
- UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN];
- UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN];
- int plen, qlen;
- for (p = ys->s, q = xs->s; q < xs->end; ) {
- plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow);
- qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow);
- if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0)
- return 1;
- p += enc_len(reg->enc, *p);
- q += enc_len(reg->enc, *q);
- }
+ if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) {
+ /* tiny version */
+ return 0;
}
else {
for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) {
@@ -2388,8 +2359,12 @@ get_head_value_node(Node* node, int exact, regex_t* reg)
if (exact != 0 &&
!NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
- if (! ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s))
+#if 0
+ UChar* tmp = sn->s;
+ if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag,
+ &tmp, sn->end))
n = node;
+#endif
}
else {
n = node;
@@ -2946,7 +2921,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg)
if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) {
Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK);
CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY);
- SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT);
+ SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT);
swap_node(node, en);
NEFFECT(node).target = en;
}
@@ -2965,9 +2940,114 @@ next_setup(Node* node, Node* next_node, regex_t* reg)
return 0;
}
-#define IN_ALT (1<<0)
-#define IN_NOT (1<<1)
-#define IN_REPEAT (1<<2)
+/*
+ * Normalize the bytes [from, to) in place.
+ *
+ * Each call to ONIGENC_MBC_TO_NORMALIZE() reads at the read pointer (which
+ * it advances) and is handed the write pointer in the same buffer; its
+ * return value is added to the write pointer.  end is the end of the whole
+ * source string and is passed through as the read limit.
+ *
+ * Returns the write pointer reached after the last character, i.e. the end
+ * of the normalized range that starts at from.
+ */
+static UChar*
+ambig_normalize_range(UChar* from, UChar* to, UChar* end, regex_t* reg)
+{
+  UChar *rp = from;
+  UChar *wp = from;
+
+  while (rp < to) {
+    wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, &rp, end, wp);
+  }
+  return wp;
+}
+
+/*
+ * Create the string node for one run [from, to) of the divided string.
+ *
+ * A non-ambiguous run (is_ambig == 0) is handed to onig_node_new_str()
+ * unchanged.  An ambiguous run is normalized in place first; the new node is
+ * then flagged AMBIG, and AMBIG_REDUCE as well when normalization changed
+ * the byte length of the run.
+ *
+ * Returns the new node, or a null node when onig_node_new_str() fails.
+ */
+static Node*
+ambig_segment_node(UChar* from, UChar* to, UChar* end, int is_ambig,
+                   regex_t* reg)
+{
+  Node* snode;
+  UChar* wp;
+
+  if (is_ambig == 0)
+    return onig_node_new_str(from, to);
+
+  wp = ambig_normalize_range(from, to, end, reg);
+  snode = onig_node_new_str(from, wp);
+  if (IS_NOT_NULL(snode)) {
+    NSTRING_SET_AMBIG(snode);
+    if (wp != to) NSTRING_SET_AMBIG_REDUCE(snode);
+  }
+  return snode;
+}
+
+/*
+ * Append snode as a new cell at the tail of the list *rootp.
+ * *tailpp tracks the right link of the last cell; it is ignored while the
+ * list is still empty.
+ *
+ * On failure (snode is null, or the list cell cannot be allocated) both
+ * snode and the list built so far are released, *rootp is reset, and
+ * ONIGERR_MEMORY is returned, so the caller has nothing left to clean up.
+ * NOTE(review): this relies on onig_node_free() releasing a list together
+ * with the nodes it holds -- confirm against regparse.c.
+ */
+static int
+ambig_list_append(Node** rootp, Node*** tailpp, Node* snode)
+{
+  Node* cell = IS_NULL(snode) ? NULL_NODE : onig_node_new_list(snode, NULL);
+
+  if (IS_NULL(cell)) {
+    if (IS_NOT_NULL(snode)) onig_node_free(snode);
+    if (IS_NOT_NULL(*rootp)) onig_node_free(*rootp);
+    *rootp = NULL_NODE;
+    return ONIGERR_MEMORY;
+  }
+
+  if (IS_NULL(*rootp))
+    *rootp = cell;
+  else
+    **tailpp = cell;
+
+  *tailpp = &(NCONS(cell).right);
+  return 0;
+}
+
+/*
+ * Split a string node into runs of characters that are all ambiguous or all
+ * non-ambiguous under reg->ambig_flag (per ONIGENC_IS_MBC_AMBIGUOUS()).
+ *
+ * - Empty string: nothing to do.
+ * - A single run: the node is kept.  If the run is ambiguous the string is
+ *   normalized in place, sn->end is moved to the normalized end, and the
+ *   node is flagged AMBIG (plus AMBIG_REDUCE when the length changed).
+ * - Several runs: a list of new string nodes (one per run) is built and
+ *   swapped into *node; the original string node is then cleared and freed.
+ *
+ * Returns 0 on success or ONIGERR_MEMORY.  On ONIGERR_MEMORY every node
+ * created here has been released, but runs already normalized in place stay
+ * normalized in the original string.
+ */
+static int
+divide_ambig_string_node(Node* node, regex_t* reg)
+{
+  StrNode* sn = &NSTRING(node);
+  int ambig, prev_ambig, r;
+  UChar *prev, *p, *end, *prev_start, *start, *wp;
+  Node *root = NULL_NODE;
+  Node **tailp = (Node** )0;
+
+  start = prev_start = p = sn->s;
+  end = sn->end;
+  if (p >= end) return 0;
+
+  prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end);
+
+  while (p < end) {
+    prev = p;
+    ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end);
+    if (ambig == prev_ambig) continue;
+
+    /* ambiguity changed at prev: close the run [prev_start, prev) */
+    r = ambig_list_append(&root, &tailp,
+          ambig_segment_node(prev_start, prev, end, prev_ambig, reg));
+    if (r != 0) return r;
+
+    prev_ambig = ambig;
+    prev_start = prev;
+  }
+
+  if (prev_start == start) {
+    /* the whole string is a single run: keep the node itself */
+    if (prev_ambig != 0) {
+      NSTRING_SET_AMBIG(node);
+      wp = ambig_normalize_range(start, end, end, reg);
+      if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node);
+      sn->end = wp;
+    }
+    return 0;
+  }
+
+  /* close the last run [prev_start, end) */
+  r = ambig_list_append(&root, &tailp,
+        ambig_segment_node(prev_start, end, end, prev_ambig, reg));
+  if (r != 0) return r;
+
+  /* NOTE(review): swap_node() is documented as unsafe for string nodes and
+     node is one here; the clear below is what makes this work -- confirm. */
+  swap_node(node, root);
+  onig_node_str_clear(root); /* should be after swap! */
+  onig_node_free(root); /* free original string node */
+
+  return 0;
+}
+
+#define IN_ALT (1<<0)
+#define IN_NOT (1<<1)
+#define IN_REPEAT (1<<2)
+#define IN_VAR_REPEAT (1<<3)
/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
@@ -3005,33 +3085,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
break;
case N_CCLASS:
- if (IS_IGNORECASE(reg->options)) {
- int i;
- UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
- BitSetRef bs = NCCLASS(node).bs;
- for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
- c = (UChar )i;
- ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf);
- if (*lowbuf != c) {
- if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf);
- if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c);
- }
- }
- }
break;
case N_STRING:
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
- StrNode* sn = &NSTRING(node);
- UChar* p = sn->s;
-
- while (p < sn->end) {
- if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) {
- NSTRING_SET_CASE_AMBIG(node);
- break;
- }
- p += enc_len(reg->enc, *p);
- }
+ r = divide_ambig_string_node(node, reg);
}
break;
@@ -3067,9 +3125,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
Node* target = qn->target;
if ((state & IN_REPEAT) != 0) {
- qn->state |= NST_IN_REPEAT;
+ qn->state |= NST_IN_REPEAT;
}
-
+
if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) {
r = get_min_match_length(target, &d, env);
if (r) break;
@@ -3096,8 +3154,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
}
}
+ state |= IN_REPEAT;
if (qn->lower != qn->upper)
- state |= IN_REPEAT;
+ state |= IN_VAR_REPEAT;
r = setup_tree(target, reg, state, env);
if (r) break;
@@ -3154,11 +3213,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
break;
case EFFECT_MEMORY:
- if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) {
+ if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) {
BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum);
/* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */
}
- /* fall */
+ r = setup_tree(en->target, reg, state, env);
+ break;
+
case EFFECT_STOP_BACKTRACK:
{
Node* target = en->target;
@@ -3169,7 +3230,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
tqn->greedy != 0) { /* (?>a*), a*+ etc... */
int qtype = NTYPE(tqn->target);
if (IS_NODE_TYPE_SIMPLE(qtype))
- SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT);
+ SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT);
}
}
}
@@ -3241,26 +3302,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
/* set skip map for Boyer-Moore search */
static int
-set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case,
+set_bm_skip(UChar* s, UChar* end, OnigEncoding enc,
UChar skip[], int** int_skip)
{
int i, len;
- UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
len = end - s;
if (len < ONIG_CHAR_TABLE_SIZE) {
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len;
- if (ignore_case) {
- for (i = 0; i < len - 1; i++) {
- ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf);
- skip[*lowbuf] = len - 1 - i;
- }
- }
- else {
- for (i = 0; i < len - 1; i++)
- skip[s[i]] = len - 1 - i;
- }
+ for (i = 0; i < len - 1; i++)
+ skip[s[i]] = len - 1 - i;
}
else {
if (IS_NULL(*int_skip)) {
@@ -3269,16 +3321,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case,
}
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len;
- if (ignore_case) {
- for (i = 0; i < len - 1; i++) {
- ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf);
- (*int_skip)[*lowbuf] = len - 1 - i;
- }
- }
- else {
- for (i = 0; i < len - 1; i++)
- (*int_skip)[s[i]] = len - 1 - i;
- }
+ for (i = 0; i < len - 1; i++)
+ (*int_skip)[s[i]] = len - 1 - i;
}
return 0;
}
@@ -3291,11 +3335,12 @@ typedef struct {
} MinMaxLen;
+/* Environment handed down through the optimizer (optimize_node_left()
+   receives it as OptEnv* env).  set_optimize_info_from_tree() fills enc,
+   options and ambig_flag from the regex_t, stores the caller's ScanEnv in
+   scan_env and clears mmd.  ambig_flag is the member added by this change;
+   the remaining lines only re-align the existing members. */
typedef struct {
- MinMaxLen mmd;
- BitStatusType backrefed_status;
- OnigEncoding enc;
- OnigOptionType options;
- ScanEnv* scan_env;
+ MinMaxLen mmd;
+ BitStatusType backrefed_status;
+ OnigEncoding enc;
+ OnigOptionType options;
+ OnigAmbigType ambig_flag;
+ ScanEnv* scan_env;
} OptEnv;
typedef struct {
@@ -3332,31 +3377,31 @@ typedef struct {
OptMapInfo map; /* boundary */
} NodeOptInfo;
+/* Per-byte weight table for byte values 0x00-0x7f (128 entries).
+   It is read through map_position_value() and replaces the function-local
+   vals[] table removed below. */
+static short int ByteValTable[] = {
+ 14, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5,
+ 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5,
+ 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1
+};
+/* Returns ByteValTable[i] when i indexes the table, otherwise the fallback
+   value 4 (the removed version fell back to 7).
+   i is an int compared against a size_t, so a negative i is converted to a
+   large unsigned value and also takes the fallback branch. */
static int
map_position_value(int i)
{
- static int vals[] = {
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10,
- 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
- 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5,
- 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5,
- 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
- 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10,
- };
-
- if (i < sizeof(vals)/sizeof(vals[0])) return vals[i];
-
- return 7; /* Take it easy. */
+ if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0]))
+ return (int )ByteValTable[i];
+ else
+ return 4; /* Take it easy. */
}
static int
distance_value(MinMaxLen* mm)
{
/* 1000 / (min-max-dist + 1) */
- static int dist_vals[] = {
+ static short int dist_vals[] = {
1000, 500, 333, 250, 200, 167, 143, 125, 111, 100,
91, 83, 77, 71, 67, 63, 59, 56, 53, 50,
48, 45, 43, 42, 40, 38, 37, 36, 34, 33,
@@ -3376,7 +3421,7 @@ distance_value(MinMaxLen* mm)
d = mm->max - mm->min;
if (d < sizeof(dist_vals)/sizeof(dist_vals[0]))
/* return dist_vals[d] * 16 / (mm->min + 12); */
- return dist_vals[d];
+ return (int )dist_vals[d];
else
return 1;
}
@@ -3432,12 +3477,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from)
to->max = distance_add(to->max, from->max);
}
+#if 0
+/* Adds len to both the min and the max of *to, using distance_add().
+   Compiled out by this change, which also removes the add_len_mml() call
+   in the N_STRING case of optimize_node_left(). */
static void
add_len_mml(MinMaxLen* to, OnigDistance len)
{
to->min = distance_add(to->min, len);
to->max = distance_add(to->max, len);
}
+#endif
static void
alt_merge_mml(MinMaxLen* to, MinMaxLen* from)
@@ -3584,7 +3631,7 @@ concat_opt_exact_info_str(OptExactInfo* to,
to->s[i++] = *p++;
}
else {
- len = enc_len(enc, *p);
+ len = enc_len(enc, p);
if (i + len > OPT_EXACT_MAXLEN) break;
for (j = 0; j < len; j++)
to->s[i++] = *p++;
@@ -3611,7 +3658,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
for (i = 0; i < to->len && i < add->len; ) {
if (to->s[i] != add->s[i]) break;
- len = enc_len(env->enc, to->s[i]);
+ len = enc_len(env->enc, to->s + i);
for (j = 1; j < len; j++) {
if (to->s[i+j] != add->s[i+j]) break;
@@ -3633,12 +3680,24 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env)
+/* Keep the preferred of two exact-string candidates in *now: *alt is copied
+   over *now when comp_distance_value() returns a positive value for the two
+   weights v1 and v2 computed below.
+   Both weights start as the candidate lengths; when both candidates are at
+   most 2 bytes long they are replaced by first-byte values from
+   map_position_value() plus 5 for a 2-byte candidate.  A weight is doubled
+   when its candidate is not ignore-case. */
static void
select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt)
{
- int vlen1, vlen2;
+ int v1, v2;
+
+ v1 = now->len;
+ v2 = alt->len;
- vlen1 = now->len * (now->ignore_case ? 1 : 2);
- vlen2 = alt->len * (alt->ignore_case ? 1 : 2);
+ if (v1 <= 2 && v2 <= 2) {
+ /* ByteValTable[x] is big value --> low price */
+ /* NOTE(review): the values are assigned crosswise here (v2 from now,
+    v1 from alt), presumably to invert the table as the comment above
+    says, while the +5 below is applied straight (v1 for now, v2 for
+    alt) -- confirm that this mix is intended. */
+ v2 = map_position_value(now->s[0]);
+ v1 = map_position_value(alt->s[0]);
- if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0)
+ if (now->len > 1) v1 += 5;
+ if (alt->len > 1) v2 += 5;
+ }
+
+ if (now->ignore_case == 0) v1 *= 2;
+ if (alt->ignore_case == 0) v2 *= 2;
+
+ if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0)
copy_opt_exact_info(now, alt);
}
@@ -3661,7 +3720,7 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from)
}
static void
-add_char_opt_map_info(OptMapInfo* map, int c)
+add_char_opt_map_info(OptMapInfo* map, UChar c)
{
if (map->map[c] == 0) {
map->map[c] = 1;
@@ -3669,26 +3728,48 @@ add_char_opt_map_info(OptMapInfo* map, int c)
}
}
-static void
-add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc)
+/* Add to map the first byte of the character at p and the first byte of
+   every character the encoding lists as an ambiguous partner of it.
+
+   p .. end    the character is decoded with ONIGENC_MBC_TO_CODE(enc, p, end)
+   ambig_flag  set of ambiguity types to consider; each single-bit type up
+               to ONIGENC_AMBIGUOUS_MATCH_LIMIT is tried in turn
+
+   For each enabled type, every pair-table entry whose `from` equals the
+   code contributes the first byte of its encoded `to`.  When
+   ONIGENC_AMBIGUOUS_MATCH_COMPOUND is set in ambig_flag, the first
+   compound-table entry for the code also contributes the first byte of
+   code[0] of each of its items.
+
+   Returns 0, or the negative value returned by ONIGENC_CODE_TO_MBC().
+   (The replaced version returned void and took a single byte c.) */
+static int
+add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end,
+ OnigEncoding enc, OnigAmbigType ambig_flag)
{
- UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ int i, j, n, len;
+ UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN];
+ OnigCodePoint code, ccode;
+ OnigCompAmbigCodes* ccs;
+ OnigPairAmbigCodes* pccs;
+ OnigAmbigType amb;
- add_char_opt_map_info(map, c);
+ add_char_opt_map_info(map, p[0]);
+ code = ONIGENC_MBC_TO_CODE(enc, p, end);
- x = (UChar )c;
- ONIGENC_MBC_TO_LOWER(enc, &x, low);
- if (*low != x) {
- add_char_opt_map_info(map, (int )(*low));
- }
- else {
- int i;
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
- x = (UChar )i;
- ONIGENC_MBC_TO_LOWER(enc, &x, low);
- if ((int )(*low) == c) add_char_opt_map_info(map, i);
+ for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
+ if ((amb & ambig_flag) == 0) continue;
+
+ n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs);
+ for (i = 0; i < n; i++) {
+ if (pccs[i].from == code) {
+ len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf);
+ if (len < 0) return len;
+ add_char_opt_map_info(map, buf[0]);
+ }
+ }
+
+ if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) {
+ n = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs);
+ for (i = 0; i < n; i++) {
+ if (ccs[i].code == code) {
+ for (j = 0; j < ccs[i].n; j++) {
+ ccode = ccs[i].items[j].code[0];
+ len = ONIGENC_CODE_TO_MBC(enc, ccode, buf);
+ if (len < 0) return len;
+ add_char_opt_map_info(map, buf[0]);
+ }
+ break;
+ }
+ }
}
}
+ return 0;
}
static void
@@ -3894,143 +3975,110 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
case N_STRING:
{
- UChar *p;
- int len, plen;
StrNode* sn = &(NSTRING(node));
int slen = sn->end - sn->s;
int is_raw = NSTRING_IS_RAW(node);
- if ((! IS_IGNORECASE(env->options)) || is_raw) {
+ if (! NSTRING_IS_AMBIG(node)) {
concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
NSTRING_IS_RAW(node), env->enc);
if (slen > 0) {
add_char_opt_map_info(&opt->map, *(sn->s));
}
+ set_mml(&opt->len, slen, slen);
}
else {
- for (p = sn->s; p < sn->end; ) {
- len = enc_len(env->enc, *p);
- if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) {
- break;
- }
- p += len;
- }
+ int n, max;
- plen = p - sn->s;
- if (plen > slen / 5) {
- concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc);
- concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc);
- opt->exm.ignore_case = 1;
- if (opt->exm.len == sn->end - p)
- opt->exm.reach_end = 1;
-
- copy_mml(&(opt->exm.mmd), &(opt->exb.mmd));
- add_len_mml(&(opt->exm.mmd), plen);
- }
- else {
- concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
- is_raw, env->enc);
- opt->exb.ignore_case = 1;
- }
+ concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
+ is_raw, env->enc);
+ opt->exb.ignore_case = 1;
if (slen > 0) {
- if (p == sn->s)
- add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc);
- else
- add_char_opt_map_info(&opt->map, *(sn->s));
+ r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end,
+ env->enc, env->ambig_flag);
+ if (r != 0) break;
}
+
+ if (NSTRING_IS_AMBIG_REDUCE(node)) {
+ n = onigenc_strlen(env->enc, sn->s, sn->end);
+ max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n;
+ }
+ else {
+ max = slen;
+ }
+ set_mml(&opt->len, slen, max);
}
if (opt->exb.len == slen)
opt->exb.reach_end = 1;
-
- set_mml(&opt->len, slen, slen);
}
break;
case N_CCLASS:
{
- int i, z, len, found, mb_found;
+ int i, z;
CClassNode* cc = &(NCCLASS(node));
/* no need to check ignore case. (setted in setup_tree()) */
- found = mb_found = 0;
- for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
- z = BITSET_AT(cc->bs, i);
- if ((z && !cc->not) || (!z && cc->not)) {
- found = 1;
- add_char_opt_map_info(&opt->map, i);
- }
- }
- if (! ONIGENC_IS_SINGLEBYTE(env->enc)) {
- if (! IS_NULL(cc->mbuf) ||
- (cc->not != 0 && found != 0)) {
- for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
- z = ONIGENC_IS_MBC_HEAD(env->enc, i);
- if (z) {
- mb_found = 1;
- add_char_opt_map_info(&opt->map, i);
- }
- }
- }
- }
+ if (IS_NOT_NULL(cc->mbuf) || cc->not != 0) {
+ OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
+ OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- if (mb_found) {
- len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- set_mml(&opt->len, 1, len);
+ set_mml(&opt->len, min, max);
}
- else if (found) {
- len = 1;
- set_mml(&opt->len, 1, len);
+ else {
+ for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
+ z = BITSET_AT(cc->bs, i);
+ if ((z && !cc->not) || (!z && cc->not)) {
+ add_char_opt_map_info(&opt->map, (UChar )i);
+ }
+ }
+ set_mml(&opt->len, 1, 1);
}
}
break;
case N_CTYPE:
{
- int c;
- int len, min, max;
+ int i, min, max;
- min = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- max = 0;
+ max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
-#define IS_WORD_HEAD_BYTE(enc,b) \
- (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \
- : ONIGENC_IS_MBC_HEAD(enc,b))
+ if (max == 1) {
+ min = 1;
- switch (NCTYPE(node).type) {
- case CTYPE_WORD:
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (IS_WORD_HEAD_BYTE(env->enc, c)) {
- add_char_opt_map_info(&opt->map, c);
- len = enc_len(env->enc, c);
- if (len < min) min = len;
- if (len > max) max = len;
- }
- }
- break;
+ switch (NCTYPE(node).type) {
+ case CTYPE_NOT_WORD:
+ for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
+ if (! ONIGENC_IS_CODE_WORD(env->enc, i)) {
+ add_char_opt_map_info(&opt->map, (UChar )i);
+ }
+ }
+ break;
- case CTYPE_NOT_WORD:
- for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (! IS_WORD_HEAD_BYTE(env->enc, c)) {
- add_char_opt_map_info(&opt->map, c);
- len = enc_len(env->enc, c);
- if (len < min) min = len;
- if (len > max) max = len;
- }
+ case CTYPE_WORD:
+ for (i = 0; i < SINGLE_BYTE_SIZE; i++) {
+ if (ONIGENC_IS_CODE_WORD(env->enc, i)) {
+ add_char_opt_map_info(&opt->map, (UChar )i);
+ }
+ }
+ break;
}
- break;
}
-
+ else {
+ min = ONIGENC_MBC_MINLEN(env->enc);
+ }
set_mml(&opt->len, min, max);
}
break;
case N_ANYCHAR:
{
- OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc);
- set_mml(&opt->len, 1, len);
+ OnigDistance min = ONIGENC_MBC_MINLEN(env->enc);
+ OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc);
+ set_mml(&opt->len, min, max);
}
break;
@@ -4231,36 +4279,20 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
if (e->len == 0) return 0;
- reg->exact = onig_strdup(e->s, e->s + e->len);
- CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
-
- reg->exact_end = reg->exact + e->len;
-
if (e->ignore_case) {
- UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN];
- int len, low_len, i, j, alloc_size;
-
- alloc_size = e->len;
- i = j = 0;
- while (i < e->len) {
- low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf);
- len = enc_len(reg->enc, e->s[i]);
- if (low_len > alloc_size - i) {
- reg->exact = xrealloc(reg->exact, alloc_size * 2);
- CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
- alloc_size *= 2;
- }
-
- xmemcpy(&(reg->exact[j]), buf, low_len);
- i += len;
- j += low_len;
- }
- reg->exact_end = reg->exact + j;
+ reg->exact = (UChar* )xmalloc(e->len);
+ CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
+ xmemcpy(reg->exact, e->s, e->len);
+ reg->exact_end = reg->exact + e->len;
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
}
else {
int allow_reverse;
+ reg->exact = onig_strdup(e->s, e->s + e->len);
+ CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY);
+ reg->exact_end = reg->exact + e->len;
+
if (e->anc.left_anchor & ANCHOR_BEGIN_LINE)
allow_reverse = 1;
else
@@ -4268,7 +4300,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end);
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
- r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0,
+ r = set_bm_skip(reg->exact, reg->exact_end, reg->enc,
reg->map, &(reg->int_map));
if (r) return r;
@@ -4328,6 +4360,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env)
env.enc = reg->enc;
env.options = reg->options;
+ env.ambig_flag = reg->ambig_flag;
env.scan_env = scan_env;
clear_mml(&env.mmd);
@@ -4482,17 +4515,26 @@ print_optimize_info(FILE* f, regex_t* reg)
fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact));
}
else if (reg->optimize & ONIG_OPTIMIZE_MAP) {
- int i, n = 0;
+ int c, i, n = 0;
+
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
if (reg->map[i]) n++;
fprintf(f, "map: n=%d\n", n);
if (n > 0) {
+ c = 0;
fputc('[', f);
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
- if (reg->map[i] && enc_len(reg->enc, i) == 1 &&
- ONIGENC_IS_CODE_PRINT(reg->enc, i))
- fputc(i, f);
+ for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) {
+ if (reg->map[i] != 0) {
+ if (c > 0) fputs(", ", f);
+ c++;
+ if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 &&
+ ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i))
+ fputc(i, f);
+ else
+ fprintf(f, "%d", i);
+ }
+ }
fprintf(f, "]\n");
}
}
@@ -4531,7 +4573,7 @@ onig_free(regex_t* reg)
xfree(from);\
} while (0)
-static void
+extern void
onig_transfer(regex_t* to, regex_t* from)
{
THREAD_ATOMIC_START;
@@ -4545,7 +4587,7 @@ onig_transfer(regex_t* to, regex_t* from)
}\
} while (0)
-static void
+extern void
onig_chain_link_add(regex_t* to, regex_t* add)
{
THREAD_ATOMIC_START;
@@ -4598,7 +4640,8 @@ onig_clone(regex_t** to, regex_t* from)
from->state++; /* increment as search counter */
}
- r = onig_alloc_init(&reg, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT);
+ r = onig_alloc_init(&reg, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
+ from->enc, ONIG_SYNTAX_DEFAULT);
if (r != 0) {
from->state--;
return r;
@@ -4829,8 +4872,8 @@ onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end,
static int onig_inited = 0;
extern int
-onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc,
- OnigSyntaxType* syntax)
+onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag,
+ OnigEncoding enc, OnigSyntaxType* syntax)
{
if (! onig_inited)
onig_init();
@@ -4863,6 +4906,9 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc,
(*reg)->used = 0;
(*reg)->name_table = (void* )NULL;
+ (*reg)->ambig_flag = ambig_flag;
+ (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc);
+
return 0;
}
@@ -4875,7 +4921,8 @@ onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end,
if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL;
- r = onig_alloc_init(reg, option, enc, syntax);
+ r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT,
+ enc, syntax);
if (r) return r;
r = onig_compile(*reg, pattern, pattern_end, einfo);
@@ -4971,7 +5018,7 @@ OnigOpInfoType OnigOpInfo[] = {
{ OP_BACKREF2, "backref2", ARG_NON },
{ OP_BACKREF3, "backref3", ARG_NON },
{ OP_BACKREFN, "backrefn", ARG_MEMNUM },
- { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM },
+ { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL },
{ OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL },
{ OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL },
{ OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM },
@@ -4992,6 +5039,8 @@ OnigOpInfoType OnigOpInfo[] = {
{ OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL },
{ OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM },
{ OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM },
+ { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM },
+ { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM },
{ OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM },
{ OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM },
{ OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM },
@@ -5058,7 +5107,8 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s)
}
extern void
-onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp)
+onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp,
+ OnigEncoding enc)
{
int i, n, arg_type;
RelAddrType addr;
@@ -5150,7 +5200,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp)
break;
case OP_EXACT1_IC:
- p_string(f, 1, bp++);
+ len = enc_len(enc, bp);
+ p_string(f, len, bp);
+ bp += len;
break;
case OP_EXACTN_IC:
GET_LENGTH_INC(len, bp);
@@ -5196,8 +5248,14 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp)
fprintf(f, ":%d:%d:%d", n, (int )code, len);
break;
- case OP_BACKREF_MULTI:
+ case OP_BACKREFN_IC:
+ mem = *((MemNumType* )bp);
+ bp += SIZE_MEMNUM;
+ fprintf(f, ":%d", mem);
+ break;
+
case OP_BACKREF_MULTI_IC:
+ case OP_BACKREF_MULTI:
fputs(" ", f);
GET_LENGTH_INC(len, bp);
for (i = 0; i < len; i++) {
@@ -5265,7 +5323,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg)
else
fputs(" ", f);
}
- onig_print_compiled_byte_code(f, bp, &bp);
+ onig_print_compiled_byte_code(f, bp, &bp, reg->enc);
}
fprintf(f, "\n");
@@ -5325,12 +5383,6 @@ print_indent_tree(FILE* f, Node* node, int indent)
fprintf(f, "%0x", bbuf->p[i]);
}
}
-#if 0
- fprintf(f, "\n");
- Indent(f, indent);
- for (i = 0; i < SINGLE_BYTE_SIZE; i++)
- fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f);
-#endif
break;
case N_CTYPE:
diff --git a/regexec.c b/regexec.c
index 5d759032e8..d5dba2124c 100644
--- a/regexec.c
+++ b/regexec.c
@@ -1,53 +1,152 @@
/**********************************************************************
-
regexec.c - Oniguruma (regular expression library)
-
- Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
#include "regint.h"
+#ifdef USE_CAPTURE_HISTORY
+static void history_tree_free(OnigCaptureTreeNode* node);
+
static void
-region_list_clear(OnigRegion** list)
+history_tree_clear(OnigCaptureTreeNode* node)
{
int i;
- if (IS_NOT_NULL(list)) {
- for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
- if (IS_NOT_NULL(list[i])) {
- xfree(list[i]);
- list[i] = (OnigRegion* )0;
+ if (IS_NOT_NULL(node)) {
+ for (i = 0; i < node->num_childs; i++) {
+ if (IS_NOT_NULL(node->childs[i])) {
+ history_tree_free(node->childs[i]);
}
}
+ for (i = 0; i < node->allocated; i++) {
+ node->childs[i] = (OnigCaptureTreeNode* )0;
+ }
+ node->num_childs = 0;
+ node->beg = ONIG_REGION_NOTPOS;
+ node->end = ONIG_REGION_NOTPOS;
+ node->group = -1;
}
}
static void
-region_list_free(OnigRegion* r)
+history_tree_free(OnigCaptureTreeNode* node)
{
- if (IS_NOT_NULL(r->list)) {
- region_list_clear(r->list);
- xfree(r->list);
- r->list = (OnigRegion** )0;
+ history_tree_clear(node);
+ xfree(node);
+}
+
+static void
+history_root_free(OnigRegion* r)
+{
+ if (IS_NOT_NULL(r->history_root)) {
+ history_tree_free(r->history_root);
+ r->history_root = (OnigCaptureTreeNode* )0;
}
}
-static OnigRegion**
-region_list_new()
+static OnigCaptureTreeNode*
+history_node_new()
{
- int i;
- OnigRegion** list;
+ OnigCaptureTreeNode* node;
+
+ node = (OnigCaptureTreeNode* )xmalloc(sizeof(OnigCaptureTreeNode));
+ CHECK_NULL_RETURN(node);
+ node->childs = (OnigCaptureTreeNode** )0;
+ node->allocated = 0;
+ node->num_childs = 0;
+ node->group = -1;
+ node->beg = ONIG_REGION_NOTPOS;
+ node->end = ONIG_REGION_NOTPOS;
+
+ return node;
+}
- list = (OnigRegion** )xmalloc(sizeof(OnigRegion*)
- * (ONIG_MAX_CAPTURE_HISTORY_GROUP + 1));
- CHECK_NULL_RETURN(list);
- for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
- list[i] = (OnigRegion* )0;
+static int
+history_tree_add_child(OnigCaptureTreeNode* parent, OnigCaptureTreeNode* child)
+{
+#define HISTORY_TREE_INIT_ALLOC_SIZE 8
+
+ if (parent->num_childs >= parent->allocated) {
+ int n, i;
+
+ if (IS_NULL(parent->childs)) {
+ n = HISTORY_TREE_INIT_ALLOC_SIZE;
+ parent->childs =
+ (OnigCaptureTreeNode** )xmalloc(sizeof(OnigCaptureTreeNode*) * n);
+ }
+ else {
+ n = parent->allocated * 2;
+ parent->childs =
+ (OnigCaptureTreeNode** )xrealloc(parent->childs,
+ sizeof(OnigCaptureTreeNode*) * n);
+ }
+ CHECK_NULL_RETURN_VAL(parent->childs, ONIGERR_MEMORY);
+ for (i = parent->allocated; i < n; i++) {
+ parent->childs[i] = (OnigCaptureTreeNode* )0;
+ }
+ parent->allocated = n;
+ }
+
+ parent->childs[parent->num_childs] = child;
+ parent->num_childs++;
+ return 0;
+}
+
+static OnigCaptureTreeNode*
+history_tree_clone(OnigCaptureTreeNode* node)
+{
+ int i;
+ OnigCaptureTreeNode *clone, *child;
+
+ clone = history_node_new();
+ CHECK_NULL_RETURN(clone);
+
+ clone->beg = node->beg;
+ clone->end = node->end;
+ for (i = 0; i < node->num_childs; i++) {
+ child = history_tree_clone(node->childs[i]);
+ if (IS_NULL(child)) {
+ history_tree_free(clone);
+ return (OnigCaptureTreeNode* )0;
+ }
+ history_tree_add_child(clone, child);
}
- return list;
+ return clone;
}
+extern OnigCaptureTreeNode*
+onig_get_capture_tree(OnigRegion* region)
+{
+ return region->history_root;
+}
+#endif /* USE_CAPTURE_HISTORY */
+
extern void
onig_region_clear(OnigRegion* region)
{
@@ -56,7 +155,9 @@ onig_region_clear(OnigRegion* region)
for (i = 0; i < region->num_regs; i++) {
region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
}
- region_list_clear(region->list);
+#ifdef USE_CAPTURE_HISTORY
+ history_root_free(region);
+#endif
}
extern int
@@ -92,88 +193,20 @@ onig_region_resize(OnigRegion* region, int n)
region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
}
- if (IS_NOT_NULL(region->list))
- region_list_clear(region->list);
-
- return 0;
-}
-
-static int
-region_ensure_size(OnigRegion* region, int n)
-{
- int i, new_size;
-
- if (region->allocated >= n)
- return 0;
-
- new_size = region->allocated;
- if (new_size == 0)
- new_size = ONIG_NREGION;
- while (new_size < n)
- new_size *= 2;
-
- if (region->allocated == 0) {
- region->beg = (int* )xmalloc(new_size * sizeof(int));
- region->end = (int* )xmalloc(new_size * sizeof(int));
- if (region->beg == 0 || region->end == 0)
- return ONIGERR_MEMORY;
-
- region->allocated = new_size;
- }
- else if (region->allocated < new_size) {
- region->beg = (int* )xrealloc(region->beg, new_size * sizeof(int));
- region->end = (int* )xrealloc(region->end, new_size * sizeof(int));
- if (region->beg == 0 || region->end == 0)
- return ONIGERR_MEMORY;
-
- region->allocated = new_size;
- }
-
- for (i = region->num_regs; i < n; i++) {
- region->beg[i] = region->end[i] = ONIG_REGION_NOTPOS;
- }
- return 0;
-}
-
-static int
-region_list_add_entry(OnigRegion* region, int group, int start, int end)
-{
- int r, pos;
- OnigRegion** list;
-
- if (group > ONIG_MAX_CAPTURE_HISTORY_GROUP)
- return ONIGERR_GROUP_NUMBER_OVER_FOR_CAPTURE_HISTORY;
-
- if (IS_NULL(region->list)) {
- region->list = region_list_new();
- CHECK_NULL_RETURN_VAL(region->list, ONIGERR_MEMORY);
- }
-
- list = region->list;
- if (IS_NULL(list[group])) {
- list[group] = onig_region_new();
- CHECK_NULL_RETURN_VAL(list[group], ONIGERR_MEMORY);
- }
-
- r = region_ensure_size(list[group], list[group]->num_regs + 1);
- if (r != 0) return r;
-
- pos = list[group]->num_regs;
- list[group]->beg[pos] = start;
- list[group]->end[pos] = end;
- list[group]->num_regs++;
-
+#ifdef USE_CAPTURE_HISTORY
+ history_root_free(region);
+#endif
return 0;
}
static void
onig_region_init(OnigRegion* region)
{
- region->num_regs = 0;
- region->allocated = 0;
- region->beg = (int* )0;
- region->end = (int* )0;
- region->list = (OnigRegion** )0;
+ region->num_regs = 0;
+ region->allocated = 0;
+ region->beg = (int* )0;
+ region->end = (int* )0;
+ region->history_root = (OnigCaptureTreeNode* )0;
}
extern OnigRegion*
@@ -195,7 +228,9 @@ onig_region_free(OnigRegion* r, int free_self)
if (r->end) xfree(r->end);
r->allocated = 0;
}
- region_list_free(r);
+#ifdef USE_CAPTURE_HISTORY
+ history_root_free(r);
+#endif
if (free_self) xfree(r);
}
}
@@ -227,28 +262,13 @@ onig_region_copy(OnigRegion* to, OnigRegion* from)
}
to->num_regs = from->num_regs;
- if (IS_NOT_NULL(from->list)) {
- if (IS_NULL(to->list)) {
- to->list = region_list_new();
- }
+#ifdef USE_CAPTURE_HISTORY
+ history_root_free(to);
- for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
- if (IS_NOT_NULL(from->list[i])) {
- if (IS_NULL(to->list[i]))
- to->list[i] = onig_region_new();
-
- onig_region_copy(to->list[i], from->list[i]);
- }
- else {
- if (IS_NOT_NULL(to->list[i])) {
- xfree(to->list[i]);
- to->list[i] = (OnigRegion* )0;
- }
- }
- }
+ if (IS_NOT_NULL(from->history_root)) {
+ to->history_root = history_tree_clone(from->history_root);
}
- else
- region_list_free(to);
+#endif
}
@@ -851,24 +871,25 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end,
}\
} while(0)
-#define STRING_CMP_IC(s1,ps2,len) do {\
- if (string_cmp_ic(encode, s1, ps2, len) == 0) \
+#define STRING_CMP_IC(ambig_flag,s1,ps2,len) do {\
+ if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \
goto fail; \
} while(0)
-static int string_cmp_ic(OnigEncoding enc,
+static int string_cmp_ic(OnigEncoding enc, int ambig_flag,
UChar* s1, UChar** ps2, int mblen)
{
- UChar buf1[ONIGENC_MBC_TO_LOWER_MAXLEN];
- UChar buf2[ONIGENC_MBC_TO_LOWER_MAXLEN];
- UChar *p1, *p2, *end, *s2;
+ UChar buf1[ONIGENC_MBC_NORMALIZE_MAXLEN];
+ UChar buf2[ONIGENC_MBC_NORMALIZE_MAXLEN];
+ UChar *p1, *p2, *end, *s2, *end2;
int len1, len2;
- s2 = *ps2;
- end = s1 + mblen;
+ s2 = *ps2;
+ end = s1 + mblen;
+ end2 = s2 + mblen;
while (s1 < end) {
- len1 = ONIGENC_MBC_TO_LOWER(enc, s1, buf1);
- len2 = ONIGENC_MBC_TO_LOWER(enc, s2, buf2);
+ len1 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s1, end, buf1);
+ len2 = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s2, end2, buf2);
if (len1 != len2) return 0;
p1 = buf1;
p2 = buf2;
@@ -877,9 +898,6 @@ static int string_cmp_ic(OnigEncoding enc,
p1++;
p2++;
}
-
- s1 += enc_len(enc, *s1);
- s2 += enc_len(enc, *s2);
}
*ps2 = s2;
@@ -895,8 +913,8 @@ static int string_cmp_ic(OnigEncoding enc,
}\
} while(0)
-#define STRING_CMP_VALUE_IC(s1,ps2,len,is_fail) do {\
- if (string_cmp_ic(encode, s1, ps2, len) == 0) \
+#define STRING_CMP_VALUE_IC(ambig_flag,s1,ps2,len,is_fail) do {\
+ if (string_cmp_ic(encode, ambig_flag, s1, ps2, len) == 0) \
is_fail = 1; \
else \
is_fail = 0; \
@@ -911,6 +929,110 @@ static int string_cmp_ic(OnigEncoding enc,
#define DATA_ENSURE_CHECK(n) (s + (n) <= end)
+#ifdef USE_CAPTURE_HISTORY
+static int
+make_capture_history_tree(OnigCaptureTreeNode* node, StackType** kp,
+ StackType* stk_top, UChar* str, regex_t* reg)
+{
+ int n, r;
+ OnigCaptureTreeNode* child;
+ StackType* k = *kp;
+
+ while (k < stk_top) {
+ if (k->type == STK_MEM_START) {
+ n = k->u.mem.num;
+ if (n <= ONIG_MAX_CAPTURE_HISTORY_GROUP &&
+ BIT_STATUS_AT(reg->capture_history, n) != 0) {
+ child = history_node_new();
+ CHECK_NULL_RETURN_VAL(child, ONIGERR_MEMORY);
+ child->group = n;
+ child->beg = (int )(k->u.mem.pstr - str);
+ r = history_tree_add_child(node, child);
+ if (r != 0) return r;
+ *kp = (k + 1);
+ r = make_capture_history_tree(child, kp, stk_top, str, reg);
+ if (r != 0) return r;
+
+ k = *kp;
+ child->end = (int )(k->u.mem.pstr - str);
+ }
+ }
+ else if (k->type == STK_MEM_END) {
+ if (k->u.mem.num == node->group) {
+ node->end = (int )(k->u.mem.pstr - str);
+ *kp = k;
+ return 0;
+ }
+ }
+ k++;
+ }
+
+ return 1; /* 1: root node ending. */
+}
+#endif
+
+#ifdef RUBY_PLATFORM
+
+typedef struct {
+ int state;
+ regex_t* reg;
+ MatchArg* msa;
+ StackType* stk_base;
+} TrapEnsureArg;
+
+static VALUE
+trap_ensure(VALUE arg)
+{
+ TrapEnsureArg* ta = (TrapEnsureArg* )arg;
+
+ if (ta->state == 0) { /* trap_exec() is not normal return */
+ ta->reg->state--;
+ if (! IS_NULL(ta->msa->stack_p) && ta->stk_base != ta->msa->stack_p)
+ xfree(ta->stk_base);
+
+ MATCH_ARG_FREE(*(ta->msa));
+ }
+
+ return Qnil;
+}
+
+static VALUE
+trap_exec(VALUE arg)
+{
+ TrapEnsureArg* ta;
+
+ rb_trap_exec();
+
+ ta = (TrapEnsureArg* )arg;
+ ta->state = 1; /* normal return */
+ return Qnil;
+}
+
+extern void
+onig_exec_trap(regex_t* reg, MatchArg* msa, StackType* stk_base)
+{
+ VALUE arg;
+ TrapEnsureArg ta;
+
+ ta.state = 0;
+ ta.reg = reg;
+ ta.msa = msa;
+ ta.stk_base = stk_base;
+ arg = (VALUE )(&ta);
+ rb_ensure(trap_exec, arg, trap_ensure, arg);
+}
+
+#define CHECK_INTERRUPT_IN_MATCH_AT do {\
+ if (rb_trap_pending) {\
+ if (! rb_prohibit_interrupt) {\
+ onig_exec_trap(reg, msa, stk_base);\
+ }\
+ }\
+} while (0)
+#else
+#define CHECK_INTERRUPT_IN_MATCH_AT
+#endif /* RUBY_PLATFORM */
+
#ifdef ONIG_DEBUG_STATISTICS
#define USE_TIMEOFDAY
@@ -955,6 +1077,7 @@ static int MaxStackDepth = 0;
} while (0)
#ifdef RUBY_PLATFORM
+
/*
* :nodoc:
*/
@@ -1047,7 +1170,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
RelAddrType addr;
OnigOptionType option = reg->options;
OnigEncoding encode = reg->enc;
- int ignore_case;
+ OnigAmbigType ambig_flag = reg->ambig_flag;
UChar *s, *q, *sbegin;
UChar *p = reg->p;
char *alloca_base;
@@ -1059,7 +1182,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
n = reg->num_repeat + reg->num_mem * 2;
STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE);
- ignore_case = IS_IGNORECASE(option);
pop_level = reg->stack_pop_level;
num_mem = reg->num_mem;
repeat_stk = (StackIndex* )alloca_base;
@@ -1092,7 +1214,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
fprintf(stderr, "%4d> \"", (int )(s - str));
bp = buf;
for (i = 0, q = s; i < 7 && q < end; i++) {
- len = enc_len(encode, *q);
+ len = enc_len(encode, q);
while (len-- > 0) *bp++ = *q++;
}
if (q < end) { xmemcpy(bp, "...\"", 4); bp += 4; }
@@ -1100,7 +1222,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
*bp = 0;
fputs(buf, stderr);
for (i = 0; i < 20 - (bp - buf); i++) fputc(' ', stderr);
- onig_print_compiled_byte_code(stderr, p, NULL);
+ onig_print_compiled_byte_code(stderr, p, NULL, encode);
fprintf(stderr, "\n");
}
#endif
@@ -1155,27 +1277,33 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
}
}
+#ifdef USE_CAPTURE_HISTORY
if (reg->capture_history != 0) {
- UChar *pstart, *pend;
- for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
- if (BIT_STATUS_AT(reg->capture_history, i) != 0) {
- stkp = stk_base;
- do {
- STACK_GET_MEM_RANGE(stkp, i, pstart, pend);
- if (stkp < stk) {
- int r;
- r = region_list_add_entry(region, i,
- pstart - str, pend - str);
- if (r) {
- STACK_SAVE;
- return r;
- }
- }
- stkp++;
- } while (stkp < stk);
- }
- }
- } /* list of captures */
+ int r;
+ OnigCaptureTreeNode* node;
+
+ if (IS_NULL(region->history_root)) {
+ region->history_root = node = history_node_new();
+ CHECK_NULL_RETURN_VAL(node, ONIGERR_MEMORY);
+ }
+ else {
+ node = region->history_root;
+ history_tree_clear(node);
+ }
+
+ node->group = 0;
+ node->beg = sstart - str;
+ node->end = s - str;
+
+ stkp = stk_base;
+ r = make_capture_history_tree(region->history_root, &stkp,
+ stk, str, reg);
+ if (r < 0) {
+ best_len = r; /* error code */
+ goto finish;
+ }
+ }
+#endif /* USE_CAPTURE_HISTORY */
#ifdef USE_POSIX_REGION_OPTION
} /* else IS_POSIX_REGION() */
#endif
@@ -1212,12 +1340,12 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_EXACT1_IC: STAT_OP_IN(OP_EXACT1_IC);
{
int len;
- UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN];
- len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf);
- DATA_ENSURE(len);
+ DATA_ENSURE(1);
+ len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf);
+ DATA_ENSURE(0);
q = lowbuf;
- s += enc_len(encode, *s);
while (len-- > 0) {
if (*p != *q) goto fail;
p++; q++;
@@ -1296,16 +1424,16 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_EXACTN_IC: STAT_OP_IN(OP_EXACTN_IC);
{
int len;
- UChar *q, *endp, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ UChar *q, *endp, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN];
GET_LENGTH_INC(tlen, p);
endp = p + tlen;
while (p < endp) {
- len = ONIGENC_MBC_TO_LOWER(encode, s, lowbuf);
- DATA_ENSURE(len);
sprev = s;
- s += enc_len(encode, *s);
+ DATA_ENSURE(1);
+ len = ONIGENC_MBC_TO_NORMALIZE(encode, ambig_flag, &s, end, lowbuf);
+ DATA_ENSURE(0);
q = lowbuf;
while (len-- > 0) {
if (*p != *q) goto fail;
@@ -1409,20 +1537,22 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
DATA_ENSURE(1);
if (BITSET_AT(((BitSetRef )p), *s) == 0) goto fail;
p += SIZE_BITSET;
- s += enc_len(encode, *s); /* OP_CCLASS can match mb-code. \D, \S */
+ s += enc_len(encode, s); /* OP_CCLASS can match mb-code. \D, \S */
STAT_OP_OUT;
break;
case OP_CCLASS_MB: STAT_OP_IN(OP_CCLASS_MB);
- if (! ONIGENC_IS_MBC_HEAD(encode, *s)) goto fail;
+ if (! ONIGENC_IS_MBC_HEAD(encode, s)) goto fail;
cclass_mb:
GET_LENGTH_INC(tlen, p);
{
OnigCodePoint code;
UChar *ss;
- int mb_len = enc_len(encode, *s);
+ int mb_len;
+ DATA_ENSURE(1);
+ mb_len = enc_len(encode, s);
DATA_ENSURE(mb_len);
ss = s;
s += mb_len;
@@ -1442,7 +1572,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_CCLASS_MIX: STAT_OP_IN(OP_CCLASS_MIX);
DATA_ENSURE(1);
- if (ONIGENC_IS_MBC_HEAD(encode, *s)) {
+ if (ONIGENC_IS_MBC_HEAD(encode, s)) {
p += SIZE_BITSET;
goto cclass_mb;
}
@@ -1462,13 +1592,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
DATA_ENSURE(1);
if (BITSET_AT(((BitSetRef )p), *s) != 0) goto fail;
p += SIZE_BITSET;
- s += enc_len(encode, *s);
+ s += enc_len(encode, s);
STAT_OP_OUT;
break;
case OP_CCLASS_MB_NOT: STAT_OP_IN(OP_CCLASS_MB_NOT);
- if (! ONIGENC_IS_MBC_HEAD(encode, *s)) {
- DATA_ENSURE(1);
+ DATA_ENSURE(1);
+ if (! ONIGENC_IS_MBC_HEAD(encode, s)) {
s++;
GET_LENGTH_INC(tlen, p);
p += tlen;
@@ -1480,7 +1610,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
{
OnigCodePoint code;
UChar *ss;
- int mb_len = enc_len(encode, *s);
+ int mb_len = enc_len(encode, s);
if (s + mb_len > end) {
DATA_ENSURE(1);
@@ -1509,7 +1639,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_CCLASS_MIX_NOT: STAT_OP_IN(OP_CCLASS_MIX_NOT);
DATA_ENSURE(1);
- if (ONIGENC_IS_MBC_HEAD(encode, *s)) {
+ if (ONIGENC_IS_MBC_HEAD(encode, s)) {
p += SIZE_BITSET;
goto cclass_mb_not;
}
@@ -1526,21 +1656,17 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
break;
case OP_ANYCHAR: STAT_OP_IN(OP_ANYCHAR);
- n = enc_len(encode, *s);
- if (n > 1) {
- DATA_ENSURE(n);
- s += n;
- }
- else {
- DATA_ENSURE(1);
- if (ONIG_IS_NEWLINE(*s)) goto fail;
- s++;
- }
+ DATA_ENSURE(1);
+ n = enc_len(encode, s);
+ DATA_ENSURE(n);
+ if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
+ s += n;
STAT_OP_OUT;
break;
case OP_ANYCHAR_ML: STAT_OP_IN(OP_ANYCHAR_ML);
- n = enc_len(encode, *s);
+ DATA_ENSURE(1);
+ n = enc_len(encode, s);
DATA_ENSURE(n);
s += n;
STAT_OP_OUT;
@@ -1549,17 +1675,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_ANYCHAR_STAR: STAT_OP_IN(OP_ANYCHAR_STAR);
while (s < end) {
STACK_PUSH_ALT(p, s, sprev);
- n = enc_len(encode, *s);
- if (n > 1) {
- DATA_ENSURE(n);
- sprev = s;
- s += n;
- }
- else {
- if (ONIG_IS_NEWLINE(*s)) goto fail;
- sprev = s;
- s++;
- }
+ n = enc_len(encode, s);
+ DATA_ENSURE(n);
+ if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
+ sprev = s;
+ s += n;
}
STAT_OP_OUT;
break;
@@ -1567,7 +1687,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_ANYCHAR_ML_STAR: STAT_OP_IN(OP_ANYCHAR_ML_STAR);
while (s < end) {
STACK_PUSH_ALT(p, s, sprev);
- n = enc_len(encode, *s);
+ n = enc_len(encode, s);
if (n > 1) {
DATA_ENSURE(n);
sprev = s;
@@ -1586,17 +1706,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
if (*p == *s) {
STACK_PUSH_ALT(p + 1, s, sprev);
}
- n = enc_len(encode, *s);
- if (n > 1) {
- DATA_ENSURE(n);
- sprev = s;
- s += n;
- }
- else {
- if (ONIG_IS_NEWLINE(*s)) goto fail;
- sprev = s;
- s++;
- }
+ n = enc_len(encode, s);
+ DATA_ENSURE(n);
+ if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail;
+ sprev = s;
+ s += n;
}
p++;
STAT_OP_OUT;
@@ -1607,7 +1721,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
if (*p == *s) {
STACK_PUSH_ALT(p + 1, s, sprev);
}
- n = enc_len(encode, *s);
+ n = enc_len(encode, s);
if (n >1) {
DATA_ENSURE(n);
sprev = s;
@@ -1627,7 +1741,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
if (! ONIGENC_IS_MBC_WORD(encode, s, end))
goto fail;
- s += enc_len(encode, *s);
+ s += enc_len(encode, s);
STAT_OP_OUT;
break;
@@ -1636,7 +1750,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
if (ONIGENC_IS_MBC_WORD(encode, s, end))
goto fail;
- s += enc_len(encode, *s);
+ s += enc_len(encode, s);
STAT_OP_OUT;
break;
@@ -1719,7 +1833,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
STAT_OP_OUT;
continue;
}
- else if (ONIG_IS_NEWLINE(*sprev) && !ON_STR_END(s)) {
+ else if (ONIGENC_IS_MBC_NEWLINE(encode, sprev, end) && !ON_STR_END(s)) {
STAT_OP_OUT;
continue;
}
@@ -1729,7 +1843,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_END_LINE: STAT_OP_IN(OP_END_LINE);
if (ON_STR_END(s)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) {
+ if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
if (IS_NOTEOL(msa->options)) goto fail;
STAT_OP_OUT;
@@ -1738,7 +1852,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
}
#endif
}
- else if (ONIG_IS_NEWLINE(*s)) {
+ else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) {
STAT_OP_OUT;
continue;
}
@@ -1748,7 +1862,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_SEMI_END_BUF: STAT_OP_IN(OP_SEMI_END_BUF);
if (ON_STR_END(s)) {
#ifndef USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE
- if (IS_EMPTY_STR || !ONIG_IS_NEWLINE(*sprev)) {
+ if (IS_EMPTY_STR || !ONIGENC_IS_MBC_NEWLINE(encode, sprev, end)) {
#endif
if (IS_NOTEOL(msa->options)) goto fail; /* Is it needed? */
STAT_OP_OUT;
@@ -1757,7 +1871,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
}
#endif
}
- if (ONIG_IS_NEWLINE(*s) && ON_STR_END(s+1)) {
+ else if (ONIGENC_IS_MBC_NEWLINE(encode, s, end) &&
+ ON_STR_END(s + enc_len(encode, s))) {
STAT_OP_OUT;
continue;
}
@@ -1866,7 +1981,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
DATA_ENSURE(n);
sprev = s;
STRING_CMP(pstart, s, n);
- while (sprev + (len = enc_len(encode, *sprev)) < s)
+ while (sprev + (len = enc_len(encode, sprev)) < s)
sprev += len;
STAT_OP_OUT;
@@ -1897,8 +2012,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
n = pend - pstart;
DATA_ENSURE(n);
sprev = s;
- STRING_CMP_IC(pstart, &s, n);
- while (sprev + (len = enc_len(encode, *sprev)) < s)
+ STRING_CMP_IC(ambig_flag, pstart, &s, n);
+ while (sprev + (len = enc_len(encode, sprev)) < s)
sprev += len;
STAT_OP_OUT;
@@ -1933,7 +2048,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
STRING_CMP_VALUE(pstart, swork, n, is_fail);
if (is_fail) continue;
s = swork;
- while (sprev + (len = enc_len(encode, *sprev)) < s)
+ while (sprev + (len = enc_len(encode, sprev)) < s)
sprev += len;
p += (SIZE_MEMNUM * (tlen - i - 1));
@@ -1969,10 +2084,10 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
DATA_ENSURE(n);
sprev = s;
swork = s;
- STRING_CMP_VALUE_IC(pstart, &swork, n, is_fail);
+ STRING_CMP_VALUE_IC(ambig_flag, pstart, &swork, n, is_fail);
if (is_fail) continue;
s = swork;
- while (sprev + (len = enc_len(encode, *sprev)) < s)
+ while (sprev + (len = enc_len(encode, sprev)) < s)
sprev += len;
p += (SIZE_MEMNUM * (tlen - i - 1));
@@ -1986,7 +2101,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_SET_OPTION_PUSH: STAT_OP_IN(OP_SET_OPTION_PUSH);
GET_OPTION_INC(option, p);
- ignore_case = IS_IGNORECASE(option);
STACK_PUSH_ALT(p, s, sprev);
p += SIZE_OP_SET_OPTION + SIZE_OP_FAIL;
STAT_OP_OUT;
@@ -1995,7 +2109,6 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
case OP_SET_OPTION: STAT_OP_IN(OP_SET_OPTION);
GET_OPTION_INC(option, p);
- ignore_case = IS_IGNORECASE(option);
STAT_OP_OUT;
continue;
break;
@@ -2027,6 +2140,8 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
+ case OP_REPEAT_INC_SG:
+ case OP_REPEAT_INC_NG_SG:
p += SIZE_MEMNUM;
break;
default:
@@ -2093,6 +2208,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
GET_RELADDR_INC(addr, p);
p += addr;
STAT_OP_OUT;
+ CHECK_INTERRUPT_IN_MATCH_AT;
continue;
break;
@@ -2182,13 +2298,14 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
}
else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) {
STACK_PUSH_ALT(p, s, sprev);
- p = stkp->u.repeat.pcode;
+ p = STACK_AT(si)->u.repeat.pcode; /* Don't use stkp after PUSH. */
}
else {
p = stkp->u.repeat.pcode;
}
STACK_PUSH_REPEAT_INC(si);
STAT_OP_OUT;
+ CHECK_INTERRUPT_IN_MATCH_AT;
continue;
break;
@@ -2206,11 +2323,11 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
repeat_inc_ng:
stkp->u.repeat.count++;
- if (stkp->u.repeat.count < reg->repeat_range[mem].upper
- || reg->repeat_range[mem].upper < 0 /* IS_REPEAT_INFINITE(upper) */) {
+ if (stkp->u.repeat.count < reg->repeat_range[mem].upper ||
+ IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) {
if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) {
UChar* pcode = stkp->u.repeat.pcode;
-
+
STACK_PUSH_REPEAT_INC(si);
STACK_PUSH_ALT(pcode, s, sprev);
}
@@ -2223,6 +2340,7 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
STACK_PUSH_REPEAT_INC(si);
}
STAT_OP_OUT;
+ CHECK_INTERRUPT_IN_MATCH_AT;
continue;
break;
@@ -2233,6 +2351,13 @@ match_at(regex_t* reg, UChar* str, UChar* end, UChar* sstart,
goto repeat_inc_ng;
break;
+ case OP_REPEAT_INC_NG_SG: STAT_OP_IN(OP_REPEAT_INC_NG_SG);
+ GET_MEMNUM_INC(mem, p); /* mem: OP_REPEAT ID */
+ STACK_GET_REPEAT(mem, stkp);
+ si = GET_STACK_INDEX(stkp);
+ goto repeat_inc_ng;
+ break;
+
case OP_PUSH_POS: STAT_OP_IN(OP_PUSH_POS);
STACK_PUSH_POS(s, sprev);
STAT_OP_OUT;
@@ -2390,73 +2515,39 @@ slow_search(OnigEncoding enc, UChar* target, UChar* target_end,
if (t == target_end)
return s;
}
- s += enc_len(enc, *s);
+ s += enc_len(enc, s);
}
return (UChar* )NULL;
}
-#if 0
-static int
-str_trans_match_after_head_byte(OnigEncoding enc,
- int len, UChar* t, UChar* tend, UChar* p)
-{
- while (--len > 0) {
- if (*t != *p) break;
- t++; p++;
- }
-
- if (len == 0) {
- int lowlen;
- UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
-
- while (t < tend) {
- len = enc_len(enc, *p);
- lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf);
- q = lowbuf;
- while (lowlen > 0) {
- if (*t++ != *q++) break;
- lowlen--;
- }
- if (lowlen > 0) break;
- p += len;
- }
- if (t == tend)
- return 1;
- }
-
- return 0;
-}
-#endif
-
static int
-str_lower_case_match(OnigEncoding enc, UChar* t, UChar* tend, UChar* p)
+str_lower_case_match(OnigEncoding enc, int ambig_flag,
+ UChar* t, UChar* tend, UChar* p, UChar* end)
{
- int len, lowlen;
- UChar *q, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ int lowlen;
+ UChar *q, lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN];
while (t < tend) {
- len = enc_len(enc, *p);
- lowlen = ONIGENC_MBC_TO_LOWER(enc, p, lowbuf);
+ lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &p, end, lowbuf);
q = lowbuf;
while (lowlen > 0) {
if (*t++ != *q++) return 0;
lowlen--;
}
- p += len;
}
return 1;
}
static UChar*
-slow_search_ic(OnigEncoding enc,
+slow_search_ic(OnigEncoding enc, int ambig_flag,
UChar* target, UChar* target_end,
UChar* text, UChar* text_end, UChar* text_range)
{
- int len, lowlen;
- UChar *t, *p, *s, *end;
- UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ int lowlen;
+ UChar *t, *p, *s, *end, *z;
+ UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN];
end = text_end - (target_end - target) + 1;
if (end > text_range)
@@ -2465,22 +2556,21 @@ slow_search_ic(OnigEncoding enc,
s = text;
while (s < end) {
- len = enc_len(enc, *s);
- lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf);
+ z = s;
+ lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf);
if (*target == *lowbuf) {
p = lowbuf + 1;
t = target + 1;
while (--lowlen > 0) {
if (*p != *t) break;
- p++; *t++;
+ p++; t++;
}
if (lowlen == 0) {
- if (str_lower_case_match(enc, t, target_end, s + len))
- return s;
+ if (str_lower_case_match(enc, ambig_flag,
+ t, target_end, s, text_end))
+ return z;
}
}
-
- s += len;
}
return (UChar* )NULL;
@@ -2517,14 +2607,14 @@ slow_search_backward(OnigEncoding enc, UChar* target, UChar* target_end,
}
static UChar*
-slow_search_backward_ic(OnigEncoding enc,
+slow_search_backward_ic(OnigEncoding enc, int ambig_flag,
UChar* target,UChar* target_end,
UChar* text, UChar* adjust_text,
UChar* text_end, UChar* text_start)
{
int len, lowlen;
- UChar *t, *p, *s;
- UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
+ UChar *t, *p, *s, *z;
+ UChar lowbuf[ONIGENC_MBC_NORMALIZE_MAXLEN];
s = text_end - (target_end - target);
if (s > text_start)
@@ -2533,22 +2623,24 @@ slow_search_backward_ic(OnigEncoding enc,
s = ONIGENC_LEFT_ADJUST_CHAR_HEAD(enc, adjust_text, s);
while (s >= text) {
- len = enc_len(enc, *s);
- lowlen = ONIGENC_MBC_TO_LOWER(enc, s, lowbuf);
+ len = enc_len(enc, s);
+ z = s;
+ lowlen = ONIGENC_MBC_TO_NORMALIZE(enc, ambig_flag, &s, text_end, lowbuf);
if (*target == *lowbuf) {
p = lowbuf + 1;
t = target + 1;
while (--lowlen > 0) {
if (*p != *t) break;
- p++; *t++;
+ p++; t++;
}
if (lowlen == 0) {
- if (str_lower_case_match(enc, t, target_end, s + len))
- return s;
+ if (str_lower_case_match(enc, ambig_flag,
+ t, target_end, s, text_end))
+ return z;
}
}
- s = onigenc_get_prev_char_head(enc, adjust_text, s);
+ s = onigenc_get_prev_char_head(enc, adjust_text, z);
}
return (UChar* )NULL;
@@ -2562,6 +2654,11 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end,
UChar *tail;
int skip;
+#ifdef ONIG_DEBUG_SEARCH
+ fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, text_range: %d\n",
+ (int )text, (int )text_end, (int )text_range);
+#endif
+
end = text_range + (target_end - target) - 1;
if (end > text_end)
end = text_end;
@@ -2569,7 +2666,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end,
tail = target_end - 1;
s = text;
while ((s - text) < target_end - target) {
- s += enc_len(reg->enc, *s);
+ s += enc_len(reg->enc, s);
}
s--; /* set to text check tail position. */
@@ -2587,7 +2684,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end,
if (p >= text_end) return (UChar* )NULL;
t = p;
do {
- p += enc_len(reg->enc, *p);
+ p += enc_len(reg->enc, p);
} while ((p - t) < skip && p < text_end);
s += (p - t);
@@ -2607,7 +2704,7 @@ bm_search_notrev(regex_t* reg, UChar* target, UChar* target_end,
if (p >= text_end) return (UChar* )NULL;
t = p;
do {
- p += enc_len(reg->enc, *p);
+ p += enc_len(reg->enc, p);
} while ((p - t) < skip && p < text_end);
s += (p - t);
@@ -2655,11 +2752,10 @@ bm_search(regex_t* reg, UChar* target, UChar* target_end,
}
static int
-set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc,
- int ignore_case, int** skip)
+set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc, int** skip)
+
{
int i, len;
- UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN];
if (IS_NULL(*skip)) {
*skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
@@ -2670,16 +2766,9 @@ set_bm_backward_skip(UChar* s, UChar* end, OnigEncoding enc,
for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
(*skip)[i] = len;
- if (ignore_case) {
- for (i = len - 1; i > 0; i--) {
- ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf);
- (*skip)[*lowbuf] = i;
- }
- }
- else {
- for (i = len - 1; i > 0; i--)
- (*skip)[s[i]] = i;
- }
+ for (i = len - 1; i > 0; i--)
+ (*skip)[s[i]] = i;
+
return 0;
}
@@ -2719,7 +2808,7 @@ map_search(OnigEncoding enc, UChar map[], UChar* text, UChar* text_range)
while (s < text_range) {
if (map[*s]) return s;
- s += enc_len(enc, *s);
+ s += enc_len(enc, s);
}
return (UChar* )NULL;
}
@@ -2746,6 +2835,23 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region,
UChar *prev;
MatchArg msa;
+ if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) {
+ reg->state++; /* increment as search counter */
+ if (IS_NOT_NULL(reg->chain)) {
+ onig_chain_reduce(reg);
+ reg->state++;
+ }
+ }
+ else {
+ int n = 0;
+ while (ONIG_STATE(reg) < ONIG_STATE_NORMAL) {
+ if (++n > THREAD_PASS_LIMIT_COUNT)
+ return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT;
+ THREAD_PASS;
+ }
+ reg->state++; /* increment as search counter */
+ }
+
MATCH_ARG_INIT(msa, option, region, at);
if (region
@@ -2762,7 +2868,9 @@ onig_match(regex_t* reg, UChar* str, UChar* end, UChar* at, OnigRegion* region,
prev = onigenc_get_prev_char_head(reg->enc, str, at);
r = match_at(reg, str, end, at, prev, &msa);
}
+
MATCH_ARG_FREE(msa);
+ reg->state--; /* decrement as search counter */
return r;
}
@@ -2784,7 +2892,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
}
else {
UChar *q = p + reg->dmin;
- while (p < q) p += enc_len(reg->enc, *p);
+ while (p < q) p += enc_len(reg->enc, p);
}
}
@@ -2794,7 +2902,8 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
p = slow_search(reg->enc, reg->exact, reg->exact_end, p, end, range);
break;
case ONIG_OPTIMIZE_EXACT_IC:
- p = slow_search_ic(reg->enc, reg->exact, reg->exact_end, p, end, range);
+ p = slow_search_ic(reg->enc, reg->ambig_flag,
+ reg->exact, reg->exact_end, p, end, range);
break;
case ONIG_OPTIMIZE_EXACT_BM:
@@ -2814,7 +2923,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
if (p - reg->dmin < s) {
retry_gate:
pprev = p;
- p += enc_len(reg->enc, *p);
+ p += enc_len(reg->enc, p);
goto retry;
}
@@ -2826,7 +2935,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
if (!ON_STR_BEGIN(p)) {
prev = onigenc_get_prev_char_head(reg->enc,
(pprev ? pprev : str), p);
- if (!ONIG_IS_NEWLINE(*prev))
+ if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
goto retry_gate;
}
break;
@@ -2835,10 +2944,10 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
if (ON_STR_END(p)) {
prev = onigenc_get_prev_char_head(reg->enc,
(pprev ? pprev : str), p);
- if (prev && ONIG_IS_NEWLINE(*prev))
+ if (prev && ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end))
goto retry_gate;
}
- else if (!ONIG_IS_NEWLINE(*p))
+ else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end))
goto retry_gate;
break;
}
@@ -2886,7 +2995,7 @@ forward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
}
static int set_bm_backward_skip P_((UChar* s, UChar* end, OnigEncoding enc,
- int ignore_case, int** skip));
+ int** skip));
#define BM_BACKWARD_SEARCH_LENGTH_THRESHOLD 100
@@ -2909,8 +3018,9 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
break;
case ONIG_OPTIMIZE_EXACT_IC:
- p = slow_search_backward_ic(reg->enc, reg->exact,
- reg->exact_end, range, adjrange, end, p);
+ p = slow_search_backward_ic(reg->enc, reg->ambig_flag,
+ reg->exact, reg->exact_end,
+ range, adjrange, end, p);
break;
case ONIG_OPTIMIZE_EXACT_BM:
@@ -2919,7 +3029,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
if (s - range < BM_BACKWARD_SEARCH_LENGTH_THRESHOLD)
goto exact_method;
- r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc, 0,
+ r = set_bm_backward_skip(reg->exact, reg->exact_end, reg->enc,
&(reg->int_map_backward));
if (r) return r;
}
@@ -2940,7 +3050,7 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
case ANCHOR_BEGIN_LINE:
if (!ON_STR_BEGIN(p)) {
prev = onigenc_get_prev_char_head(reg->enc, adjrange, p);
- if (!ONIG_IS_NEWLINE(*prev)) {
+ if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
p = prev;
goto retry;
}
@@ -2951,12 +3061,12 @@ backward_search_range(regex_t* reg, UChar* str, UChar* end, UChar* s,
if (ON_STR_END(p)) {
prev = onigenc_get_prev_char_head(reg->enc, adjrange, p);
if (IS_NULL(prev)) goto fail;
- if (ONIG_IS_NEWLINE(*prev)) {
+ if (ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end)) {
p = prev;
goto retry;
}
}
- else if (!ONIG_IS_NEWLINE(*p)) {
+ else if (!ONIGENC_IS_MBC_NEWLINE(reg->enc, p, end)) {
p = onigenc_get_prev_char_head(reg->enc, adjrange, p);
if (IS_NULL(p)) goto fail;
goto retry;
@@ -3096,8 +3206,10 @@ onig_search(regex_t* reg, UChar* str, UChar* end,
}
}
else if (reg->anchor & ANCHOR_SEMI_END_BUF) {
- if (ONIG_IS_NEWLINE(end[-1])) {
- semi_end = end - 1;
+ UChar* pre_end = ONIGENC_STEP_BACK(reg->enc, start, end, 1);
+
+ if (ONIGENC_IS_MBC_NEWLINE(reg->enc, pre_end, end)) {
+ semi_end = pre_end;
if (semi_end > str && start <= semi_end) {
goto end_buf;
}
@@ -3167,13 +3279,14 @@ onig_search(regex_t* reg, UChar* str, UChar* end,
while (s <= high) {
MATCH_AND_RETURN_CHECK;
prev = s;
- s += enc_len(reg->enc, *s);
+ s += enc_len(reg->enc, s);
}
if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) {
if (IS_NOT_NULL(prev)) {
- while (!ONIG_IS_NEWLINE(*prev) && s < range) {
+ while (!ONIGENC_IS_MBC_NEWLINE(reg->enc, prev, end) &&
+ s < range) {
prev = s;
- s += enc_len(reg->enc, *s);
+ s += enc_len(reg->enc, s);
}
}
}
@@ -3190,14 +3303,18 @@ onig_search(regex_t* reg, UChar* str, UChar* end,
do {
MATCH_AND_RETURN_CHECK;
prev = s;
- s += enc_len(reg->enc, *s);
+ s += enc_len(reg->enc, s);
} while (s <= range); /* exec s == range, because empty match with /$/. */
}
else { /* backward search */
if (reg->optimize != ONIG_OPTIMIZE_NONE) {
UChar *low, *high, *adjrange, *sch_start;
- adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range);
+ if (range < end)
+ adjrange = ONIGENC_LEFT_ADJUST_CHAR_HEAD(reg->enc, str, range);
+ else
+ adjrange = end;
+
if (reg->dmax != ONIG_INFINITE_DISTANCE &&
(end - range) >= reg->threshold_len) {
do {
@@ -3296,8 +3413,44 @@ onig_get_options(regex_t* reg)
return reg->options;
}
+extern OnigAmbigType
+onig_get_ambig_flag(regex_t* reg)
+{
+ return reg->ambig_flag;
+}
+
extern OnigSyntaxType*
onig_get_syntax(regex_t* reg)
{
return reg->syntax;
}
+
+extern int
+onig_number_of_captures(regex_t* reg)
+{
+ return reg->num_mem;
+}
+
+extern int
+onig_number_of_capture_histories(regex_t* reg)
+{
+#ifdef USE_CAPTURE_HISTORY
+ int i, n;
+
+ n = 0;
+ for (i = 0; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) {
+ if (BIT_STATUS_AT(reg->capture_history, i) != 0)
+ n++;
+ }
+ return n;
+#else
+ return 0;
+#endif
+}
+
+extern void
+onig_copy_encoding(OnigEncoding to, OnigEncoding from)
+{
+ *to = *from;
+}
+
diff --git a/regint.h b/regint.h
index ccbc359301..f0977ff1f3 100644
--- a/regint.h
+++ b/regint.h
@@ -1,12 +1,33 @@
+#ifndef REGINT_H
+#define REGINT_H
/**********************************************************************
-
regint.h - Oniguruma (regular expression library)
-
- Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
-#ifndef REGINT_H
-#define REGINT_H
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
/* for debug */
/* #define ONIG_DEBUG_PARSE_TREE */
@@ -19,7 +40,8 @@
/* #define ONIG_DEBUG_STATISTICS */
#if defined(ONIG_DEBUG_PARSE_TREE) || defined(ONIG_DEBUG_MATCH) || \
- defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_STATISTICS)
+ defined(ONIG_DEBUG_SEARCH) || defined(ONIG_DEBUG_COMPILE) || \
+ defined(ONIG_DEBUG_STATISTICS)
#ifndef ONIG_DEBUG
#define ONIG_DEBUG
#endif
@@ -36,7 +58,6 @@
/* spec. config */
#define USE_NAMED_GROUP
#define USE_SUBEXP_CALL
-#define USE_FOLD_MATCH /* ess-tsett etc... */
#define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */
#define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */
#define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR
@@ -51,12 +72,14 @@
/* interface to external system */
#ifdef NOT_RUBY /* gived from Makefile */
#include "config.h"
+#define USE_CAPTURE_HISTORY
#define USE_VARIABLE_META_CHARS
#define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */
#define USE_POSIX_REGION_OPTION /* needed for POSIX API support */
#define THREAD_ATOMIC_START /* depend on thread system */
#define THREAD_ATOMIC_END /* depend on thread system */
#define THREAD_PASS /* depend on thread system */
+#define CHECK_INTERRUPT /* depend on application */
#define xmalloc malloc
#define xrealloc realloc
#define xfree free
@@ -67,6 +90,14 @@
#define THREAD_ATOMIC_START DEFER_INTS
#define THREAD_ATOMIC_END ENABLE_INTS
#define THREAD_PASS rb_thread_schedule()
+#define CHECK_INTERRUPT do {\
+ if (rb_trap_pending) {\
+ if (! rb_prohibit_interrupt) {\
+ rb_trap_exec();\
+ }\
+ }\
+} while (0)
+
#define DEFAULT_WARN_FUNCTION rb_warn
#define DEFAULT_VERB_WARN_FUNCTION rb_warning
@@ -108,7 +139,9 @@
#endif
#include <ctype.h>
+#ifndef __BORLANDC__
#include <sys/types.h>
+#endif
#ifdef ONIG_DEBUG
# include <stdio.h>
@@ -291,6 +324,8 @@ typedef unsigned int BitStatusType;
/* ignore-case and multibyte status are included in compiled code. */
#define IS_DYNAMIC_OPTION(option) 0
+#define REPEAT_INFINITE -1
+#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE)
/* bitset */
#define BITS_PER_BYTE 8
@@ -530,11 +565,11 @@ enum OpCode {
#define ARG_MEMNUM 4
#define ARG_OPTION 5
-typedef short int RelAddrType;
-typedef short int AbsAddrType;
-typedef short int LengthType;
-typedef short int MemNumType;
-typedef int RepeatNumType;
+typedef int RelAddrType;
+typedef int AbsAddrType;
+typedef int LengthType;
+typedef int RepeatNumType;
+typedef short int MemNumType;
#define SIZE_OPCODE 1
#define SIZE_RELADDR sizeof(RelAddrType)
@@ -575,6 +610,7 @@ typedef int RepeatNumType;
option = *((OnigOptionType* )(p));\
(p) += SIZE_OPTION;\
} while(0)
+
#else
#define GET_RELADDR_INC(addr,p) GET_SHORT_INC(addr,p)
@@ -637,23 +673,37 @@ typedef int RepeatNumType;
#define SIZE_OP_RETURN SIZE_OPCODE
-typedef struct {
- UChar esc;
- UChar anychar;
- UChar anytime;
- UChar zero_or_one_time;
- UChar one_or_more_time;
- UChar anychar_anytime;
-} OnigMetaCharTableType;
-
-extern OnigMetaCharTableType OnigMetaCharTable;
-
-#define MC_ESC OnigMetaCharTable.esc
-#define MC_ANYCHAR OnigMetaCharTable.anychar
-#define MC_ANYTIME OnigMetaCharTable.anytime
-#define MC_ZERO_OR_ONE_TIME OnigMetaCharTable.zero_or_one_time
-#define MC_ONE_OR_MORE_TIME OnigMetaCharTable.one_or_more_time
-#define MC_ANYCHAR_ANYTIME OnigMetaCharTable.anychar_anytime
+#define MC_ESC(enc) (enc)->meta_char_table.esc
+#define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar
+#define MC_ANYTIME(enc) (enc)->meta_char_table.anytime
+#define MC_ZERO_OR_ONE_TIME(enc) (enc)->meta_char_table.zero_or_one_time
+#define MC_ONE_OR_MORE_TIME(enc) (enc)->meta_char_table.one_or_more_time
+#define MC_ANYCHAR_ANYTIME(enc) (enc)->meta_char_table.anychar_anytime
+
+#define SYN_POSIX_COMMON_OP \
+ ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \
+ ONIG_SYN_OP_DECIMAL_BACKREF | \
+ ONIG_SYN_OP_BRACKET_CC | ONIG_SYN_OP_ASTERISK_ZERO_INF | \
+ ONIG_SYN_OP_LINE_ANCHOR | \
+ ONIG_SYN_OP_ESC_CONTROL_CHARS )
+
+#define SYN_GNU_REGEX_OP \
+ ( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_BRACKET_CC | \
+ ONIG_SYN_OP_POSIX_BRACKET | ONIG_SYN_OP_DECIMAL_BACKREF | \
+ ONIG_SYN_OP_BRACE_INTERVAL | ONIG_SYN_OP_LPAREN_SUBEXP | \
+ ONIG_SYN_OP_VBAR_ALT | \
+ ONIG_SYN_OP_ASTERISK_ZERO_INF | ONIG_SYN_OP_PLUS_ONE_INF | \
+ ONIG_SYN_OP_QMARK_ZERO_ONE | \
+ ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR | ONIG_SYN_OP_ESC_CAPITAL_G_BEGIN_ANCHOR | \
+ ONIG_SYN_OP_ESC_W_WORD | \
+ ONIG_SYN_OP_ESC_B_WORD_BOUND | ONIG_SYN_OP_ESC_LTGT_WORD_BEGIN_END | \
+ ONIG_SYN_OP_ESC_S_WHITE_SPACE | ONIG_SYN_OP_ESC_D_DIGIT | \
+ ONIG_SYN_OP_LINE_ANCHOR )
+
+#define SYN_GNU_REGEX_BV \
+ ( ONIG_SYN_CONTEXT_INDEP_ANCHORS | ONIG_SYN_CONTEXT_INDEP_REPEAT_OPS | \
+ ONIG_SYN_CONTEXT_INVALID_REPEAT_OPS | ONIG_SYN_ALLOW_INVALID_INTERVAL | \
+ ONIG_SYN_BACKSLASH_ESCAPE_IN_CC | ONIG_SYN_ALLOW_DOUBLE_RANGE_OP_IN_CC )
#define SYN_POSIX_COMMON_OP \
( ONIG_SYN_OP_DOT_ANYCHAR | ONIG_SYN_OP_POSIX_BRACKET | \
@@ -691,7 +741,7 @@ typedef struct {
extern OnigOpInfoType OnigOpInfo[];
-extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp));
+extern void onig_print_compiled_byte_code P_((FILE* f, UChar* bp, UChar** nextp, OnigEncoding enc));
#ifdef ONIG_DEBUG_STATISTICS
extern void onig_statistics_init P_((void));
@@ -703,9 +753,11 @@ extern char* onig_error_code_to_format P_((int code));
extern void onig_snprintf_with_pattern PV_((char buf[], int bufsize, OnigEncoding enc, char* pat, char* pat_end, char *fmt, ...));
extern UChar* onig_strdup P_((UChar* s, UChar* end));
extern int onig_bbuf_init P_((BBuf* buf, int size));
-extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax));
+extern int onig_alloc_init P_((regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, OnigEncoding enc, OnigSyntaxType* syntax));
extern int onig_compile P_((regex_t* reg, UChar* pattern, UChar* pattern_end, OnigErrorInfo* einfo));
extern void onig_chain_reduce P_((regex_t* reg));
+extern void onig_chain_link_add P_((regex_t* to, regex_t* add));
+extern void onig_transfer P_((regex_t* to, regex_t* from));
extern int onig_is_in_code_range P_((UChar* p, OnigCodePoint code));
#endif /* REGINT_H */
diff --git a/regparse.c b/regparse.c
index 67bcbec5eb..b75c6951d0 100644
--- a/regparse.c
+++ b/regparse.c
@@ -1,10 +1,32 @@
/**********************************************************************
-
regparse.c - Oniguruma (regular expression library)
-
- Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
#include "regparse.h"
#define WARN_BUFSIZE 256
@@ -21,12 +43,14 @@ OnigSyntaxType OnigSyntaxRuby = {
ONIG_SYN_OP2_ESC_G_SUBEXP_CALL |
ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT |
ONIG_SYN_OP2_CCLASS_SET_OP | ONIG_SYN_OP2_ESC_CAPITAL_C_BAR_CONTROL |
- ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB )
+ ONIG_SYN_OP2_ESC_CAPITAL_M_BAR_META | ONIG_SYN_OP2_ESC_V_VTAB |
+ ONIG_SYN_OP2_ESC_H_XDIGIT )
, ( SYN_GNU_REGEX_BV |
ONIG_SYN_ALLOW_INTERVAL_LOW_ABBREV |
ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND |
ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP |
ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME |
+ ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY |
ONIG_SYN_WARN_CC_OP_NOT_ESCAPED |
ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT )
, ONIG_OPTION_NONE
@@ -34,15 +58,6 @@ OnigSyntaxType OnigSyntaxRuby = {
OnigSyntaxType* OnigDefaultSyntax = ONIG_SYNTAX_RUBY;
-OnigMetaCharTableType OnigMetaCharTable = {
- (OnigCodePoint )'\\' /* esc */
- , (OnigCodePoint )0 /* anychar '.' */
- , (OnigCodePoint )0 /* anytime '*' */
- , (OnigCodePoint )0 /* zero or one time '?' */
- , (OnigCodePoint )0 /* one or more time '+' */
- , (OnigCodePoint )0 /* anychar anytime */
-};
-
extern void onig_null_warn(char* s) { }
#ifdef DEFAULT_WARN_FUNCTION
@@ -93,12 +108,15 @@ bbuf_clone(BBuf** rto, BBuf* from)
#define ONOFF(v,f,negative) (negative) ? ((v) &= ~(f)) : ((v) |= (f))
-#define SET_ALL_MULTI_BYTE_RANGE(pbuf) \
- add_code_range_to_buf(pbuf, (OnigCodePoint )0x80, ~((OnigCodePoint )0))
+#define MBCODE_START_POS(enc) \
+ (OnigCodePoint )(ONIGENC_MBC_MINLEN(enc) > 1 ? 0 : 0x80)
-#define ADD_ALL_MULTI_BYTE_RANGE(code, mbuf) do {\
- if (! ONIGENC_IS_SINGLEBYTE(code)) {\
- r = SET_ALL_MULTI_BYTE_RANGE(&(mbuf));\
+#define SET_ALL_MULTI_BYTE_RANGE(enc, pbuf) \
+ add_code_range_to_buf(pbuf, MBCODE_START_POS(enc), ~((OnigCodePoint )0))
+
+#define ADD_ALL_MULTI_BYTE_RANGE(enc, mbuf) do {\
+ if (! ONIGENC_IS_SINGLEBYTE(enc)) {\
+ r = SET_ALL_MULTI_BYTE_RANGE(enc, &(mbuf));\
if (r) return r;\
}\
} while (0)
@@ -217,14 +235,23 @@ onig_strdup(UChar* s, UChar* end)
}
/* scan pattern methods */
-#define PEND_VALUE -1
-
-#define PFETCH(c) do { (c) = *p++; } while (0)
-#define PUNFETCH p--
-#define PINC p++
-#define PPEEK (p < end ? *p : PEND_VALUE)
-#define PEND (p < end ? 0 : 1)
+#define PEND_VALUE 0
+
+#define PFETCH_READY UChar* pfetch_prev
+#define PEND (p < end ? 0 : 1)
+#define PUNFETCH p = pfetch_prev
+#define PINC do { \
+ pfetch_prev = p; \
+ p += ONIGENC_MBC_ENC_LEN(enc, p); \
+} while (0)
+#define PFETCH(c) do { \
+ c = ONIGENC_MBC_TO_CODE(enc, p, end); \
+ pfetch_prev = p; \
+ p += ONIGENC_MBC_ENC_LEN(enc, p); \
+} while (0)
+#define PPEEK (p < end ? ONIGENC_MBC_TO_CODE(enc, p, end) : PEND_VALUE)
+#define PPEEK_IS(c) (PPEEK == (OnigCodePoint )c)
static UChar*
k_strcat_capa(UChar* dest, UChar* dest_end, UChar* src, UChar* src_end,
@@ -388,12 +415,15 @@ typedef struct {
regex_t* reg;
void* arg;
int ret;
+ OnigEncoding enc;
} INamesArg;
static int
i_names(UChar* key, NameEntry* e, INamesArg* arg)
{
- int r = (*(arg->func))(e->name, e->name + strlen(e->name), e->back_num,
+ int r = (*(arg->func))(e->name,
+ e->name + onigenc_str_bytelen_null(arg->enc, e->name),
+ e->back_num,
(e->back_num > 1 ? e->back_refs : &(e->back_ref1)),
arg->reg, arg->arg);
if (r != 0) {
@@ -416,6 +446,7 @@ onig_foreach_name(regex_t* reg,
narg.func = func;
narg.reg = reg;
narg.arg = arg;
+ narg.enc = reg->enc; /* should be pattern encoding. */
st_foreach(t, i_names, (HashDataType )&narg);
}
return narg.ret;
@@ -973,6 +1004,12 @@ node_new_list(Node* left, Node* right)
return node;
}
+extern Node*
+onig_node_new_list(Node* left, Node* right)
+{
+ return node_new_list(left, right);
+}
+
static Node*
node_new_alt(Node* left, Node* right)
{
@@ -1172,6 +1209,20 @@ onig_node_conv_to_str_node(Node* node, int flag)
NSTRING(node).end = NSTRING(node).buf;
}
+extern void
+onig_node_str_clear(Node* node)
+{
+ if (NSTRING(node).capa != 0 &&
+ IS_NOT_NULL(NSTRING(node).s) && NSTRING(node).s != NSTRING(node).buf) {
+ xfree(NSTRING(node).s);
+ }
+
+ NSTRING(node).capa = 0;
+ NSTRING(node).flag = 0;
+ NSTRING(node).s = NSTRING(node).buf;
+ NSTRING(node).end = NSTRING(node).buf;
+}
+
static Node*
node_new_str(UChar* s, UChar* end)
{
@@ -1190,6 +1241,12 @@ node_new_str(UChar* s, UChar* end)
return node;
}
+extern Node*
+onig_node_new_str(UChar* s, UChar* end)
+{
+ return node_new_str(s, end);
+}
+
static Node*
node_new_str_raw(UChar* s, UChar* end)
{
@@ -1205,15 +1262,6 @@ node_new_empty()
}
static Node*
-node_new_str_char(UChar c)
-{
- UChar p[1];
-
- p[0] = c;
- return node_new_str(p, p + 1);
-}
-
-static Node*
node_new_str_raw_char(UChar c)
{
UChar p[1];
@@ -1244,7 +1292,7 @@ static int
str_node_can_be_split(StrNode* sn, OnigEncoding enc)
{
if (sn->end > sn->s) {
- return ((enc_len(enc, *(sn->s)) < sn->end - sn->s) ? 1 : 0);
+ return ((enc_len(enc, sn->s) < sn->end - sn->s) ? 1 : 0);
}
return 0;
}
@@ -1253,8 +1301,9 @@ extern int
onig_scan_unsigned_number(UChar** src, UChar* end, OnigEncoding enc)
{
unsigned int num, val;
- int c;
+ OnigCodePoint c;
UChar* p = *src;
+ PFETCH_READY;
num = 0;
while (!PEND) {
@@ -1279,9 +1328,10 @@ static int
scan_unsigned_hexadecimal_number(UChar** src, UChar* end, int maxlen,
OnigEncoding enc)
{
- int c;
+ OnigCodePoint c;
unsigned int num, val;
UChar* p = *src;
+ PFETCH_READY;
num = 0;
while (!PEND && maxlen-- != 0) {
@@ -1306,9 +1356,10 @@ static int
scan_unsigned_octal_number(UChar** src, UChar* end, int maxlen,
OnigEncoding enc)
{
- int c;
+ OnigCodePoint c;
unsigned int num, val;
UChar* p = *src;
+ PFETCH_READY;
num = 0;
while (!PEND && maxlen-- != 0) {
@@ -1444,15 +1495,15 @@ add_code_range(BBuf** pbuf, ScanEnv* env, OnigCodePoint from, OnigCodePoint to)
}
static int
-not_code_range_buf(BBuf* bbuf, BBuf** pbuf)
+not_code_range_buf(OnigEncoding enc, BBuf* bbuf, BBuf** pbuf)
{
int r, i, n;
- OnigCodePoint pre, from, to, *data;
+ OnigCodePoint pre, from, *data, to = 0;
*pbuf = (BBuf* )NULL;
if (IS_NULL(bbuf)) {
set_all:
- return SET_ALL_MULTI_BYTE_RANGE(pbuf);
+ return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
}
data = (OnigCodePoint* )(bbuf->p);
@@ -1461,7 +1512,7 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf)
if (n <= 0) goto set_all;
r = 0;
- pre = 0x80;
+ pre = MBCODE_START_POS(enc);
for (i = 0; i < n; i++) {
from = data[i*2];
to = data[i*2+1];
@@ -1486,7 +1537,8 @@ not_code_range_buf(BBuf* bbuf, BBuf** pbuf)
} while (0)
static int
-or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
+or_code_range_buf(OnigEncoding enc, BBuf* bbuf1, int not1,
+ BBuf* bbuf2, int not2, BBuf** pbuf)
{
int r;
OnigCodePoint i, n1, *data1;
@@ -1495,7 +1547,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
*pbuf = (BBuf* )NULL;
if (IS_NULL(bbuf1) && IS_NULL(bbuf2)) {
if (not1 != 0 || not2 != 0)
- return SET_ALL_MULTI_BYTE_RANGE(pbuf);
+ return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
return 0;
}
@@ -1505,14 +1557,14 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
if (IS_NULL(bbuf1)) {
if (not1 != 0) {
- return SET_ALL_MULTI_BYTE_RANGE(pbuf);
+ return SET_ALL_MULTI_BYTE_RANGE(enc, pbuf);
}
else {
if (not2 == 0) {
return bbuf_clone(pbuf, bbuf2);
}
else {
- return not_code_range_buf(bbuf2, pbuf);
+ return not_code_range_buf(enc, bbuf2, pbuf);
}
}
}
@@ -1528,7 +1580,7 @@ or_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
r = bbuf_clone(pbuf, bbuf2);
}
else if (not1 == 0) { /* 1 OR (not 2) */
- r = not_code_range_buf(bbuf2, pbuf);
+ r = not_code_range_buf(enc, bbuf2, pbuf);
}
if (r != 0) return r;
@@ -1639,6 +1691,29 @@ and_code_range_buf(BBuf* bbuf1, int not1, BBuf* bbuf2, int not2, BBuf** pbuf)
}
static int
+clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
+{
+ BBuf *tbuf;
+ int r;
+
+ if (cc->not != 0) {
+ bitset_invert(cc->bs);
+
+ if (! ONIGENC_IS_SINGLEBYTE(enc)) {
+ r = not_code_range_buf(enc, cc->mbuf, &tbuf);
+ if (r != 0) return r;
+
+ bbuf_free(cc->mbuf);
+ cc->mbuf = tbuf;
+ }
+
+ cc->not = 0;
+ }
+
+ return 0;
+}
+
+static int
and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
{
int r, not1, not2;
@@ -1672,13 +1747,13 @@ and_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
if (! ONIGENC_IS_SINGLEBYTE(enc)) {
if (not1 != 0 && not2 != 0) {
- r = or_code_range_buf(buf1, 0, buf2, 0, &pbuf);
+ r = or_code_range_buf(enc, buf1, 0, buf2, 0, &pbuf);
}
else {
r = and_code_range_buf(buf1, not1, buf2, not2, &pbuf);
if (r == 0 && not1 != 0) {
BBuf *tbuf;
- r = not_code_range_buf(pbuf, &tbuf);
+ r = not_code_range_buf(enc, pbuf, &tbuf);
if (r != 0) {
bbuf_free(pbuf);
return r;
@@ -1733,10 +1808,10 @@ or_cclass(CClassNode* dest, CClassNode* cc, OnigEncoding enc)
r = and_code_range_buf(buf1, 0, buf2, 0, &pbuf);
}
else {
- r = or_code_range_buf(buf1, not1, buf2, not2, &pbuf);
+ r = or_code_range_buf(enc, buf1, not1, buf2, not2, &pbuf);
if (r == 0 && not1 != 0) {
BBuf *tbuf;
- r = not_code_range_buf(pbuf, &tbuf);
+ r = not_code_range_buf(enc, pbuf, &tbuf);
if (r != 0) {
bbuf_free(pbuf);
return r;
@@ -1855,7 +1930,6 @@ static enum ReduceType ReduceTypeTable[6][6] = {
{RQ_ASIS, RQ_PQ_Q, RQ_DEL, RQ_AQ, RQ_AQ, RQ_DEL} /* '+?' */
};
-
extern void
onig_reduce_nested_qualifier(Node* pnode, Node* cnode)
{
@@ -1908,8 +1982,9 @@ onig_reduce_nested_qualifier(Node* pnode, Node* cnode)
enum TokenSyms {
TK_EOT = 0, /* end of token */
- TK_BYTE = 1,
- TK_RAW_BYTE = 2,
+ TK_RAW_BYTE = 1,
+ TK_CHAR,
+ TK_STRING,
TK_CODE_POINT,
TK_ANYCHAR,
TK_CHAR_TYPE,
@@ -1939,6 +2014,7 @@ typedef struct {
int base; /* is number: 8, 16 (used in [....]) */
UChar* backp;
union {
+ UChar* s;
int c;
OnigCodePoint code;
int anchor;
@@ -1970,8 +2046,11 @@ static int
fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
{
int low, up, syn_allow, non_low = 0;
- int c;
+ int r = 0;
+ OnigCodePoint c;
+ OnigEncoding enc = env->enc;
UChar* p = *src;
+ PFETCH_READY;
syn_allow = IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_INVALID_INTERVAL);
@@ -2025,12 +2104,13 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
PUNFETCH;
up = low; /* {n} : exact n times */
+ r = 2; /* fixed */
}
if (PEND) goto invalid;
PFETCH(c);
if (IS_SYNTAX_OP(env->syntax, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) {
- if (c != MC_ESC) goto invalid;
+ if (c != MC_ESC(enc)) goto invalid;
PFETCH(c);
}
if (c != '}') goto invalid;
@@ -2043,7 +2123,7 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
tok->u.repeat.lower = low;
tok->u.repeat.upper = up;
*src = p;
- return 0;
+ return r; /* 0: normal {n,m}, 2: fixed {n} */
invalid:
if (syn_allow)
@@ -2056,8 +2136,11 @@ fetch_range_qualifier(UChar** src, UChar* end, OnigToken* tok, ScanEnv* env)
static int
fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
{
- int c;
+ int v;
+ OnigCodePoint c;
+ OnigEncoding enc = env->enc;
UChar* p = *src;
+ PFETCH_READY;
if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH;
@@ -2070,9 +2153,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
if (c != '-') return ONIGERR_META_CODE_SYNTAX;
if (PEND) return ONIGERR_END_PATTERN_AT_META;
PFETCH(c);
- if (c == MC_ESC) {
- c = fetch_escaped_value(&p, end, env);
- if (c < 0) return c;
+ if (c == MC_ESC(enc)) {
+ v = fetch_escaped_value(&p, end, env);
+ if (v < 0) return v;
+ c = (OnigCodePoint )v;
}
c = ((c & 0xff) | 0x80);
}
@@ -2095,9 +2179,10 @@ fetch_escaped_value(UChar** src, UChar* end, ScanEnv* env)
control:
if (PEND) return ONIGERR_END_PATTERN_AT_CONTROL;
PFETCH(c);
- if (c == MC_ESC) {
- c = fetch_escaped_value(&p, end, env);
- if (c < 0) return c;
+ if (c == MC_ESC(enc)) {
+ v = fetch_escaped_value(&p, end, env);
+ if (v < 0) return v;
+ c = (OnigCodePoint )v;
}
else if (c == '?')
c = 0177;
@@ -2129,11 +2214,13 @@ static int fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env);
static int
fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
{
- int r, len, is_num;
- int c = 0;
- OnigCodePoint code, first_code;
+ int r, is_num;
+ OnigCodePoint c = 0;
+ OnigCodePoint first_code;
+ OnigEncoding enc = env->enc;
UChar *name_end;
UChar *p = *src;
+ PFETCH_READY;
name_end = end;
r = 0;
@@ -2144,23 +2231,20 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
else {
first_code = ONIGENC_MBC_TO_CODE(env->enc, p, end);
PFETCH(c);
+ first_code = c;
if (c == '>')
return ONIGERR_EMPTY_GROUP_NAME;
- if (ONIGENC_IS_CODE_DIGIT(env->enc, first_code)) {
+ if (ONIGENC_IS_CODE_DIGIT(enc, c)) {
if (ref == 1)
is_num = 1;
else {
r = ONIGERR_INVALID_GROUP_NAME;
}
}
- else if (! ONIGENC_IS_CODE_WORD(env->enc, first_code)) {
+ else if (!ONIGENC_IS_CODE_WORD(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
-
- len = enc_len(env->enc, c);
- while (!PEND && len-- > 1)
- PFETCH(c);
}
while (!PEND) {
@@ -2169,35 +2253,28 @@ fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
PFETCH(c);
if (c == '>' || c == ')') break;
- len = enc_len(env->enc, c);
if (is_num == 1) {
- if (len == 1) {
- if (! ONIGENC_IS_CODE_DIGIT(env->enc, code)) {
- if (!ONIGENC_IS_CODE_WORD(env->enc, code))
- r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
- else
- r = ONIGERR_INVALID_GROUP_NAME;
- }
- }
- else {
- r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
+ if (! ONIGENC_IS_CODE_DIGIT(enc, c)) {
+ if (!ONIGENC_IS_CODE_WORD(enc, c))
+ r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
+ else
+ r = ONIGERR_INVALID_GROUP_NAME;
}
}
else {
- if (! ONIGENC_IS_CODE_WORD(env->enc, code)) {
+ if (!ONIGENC_IS_CODE_WORD(enc, c)) {
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
}
}
-
- while (!PEND && len-- > 1)
- PFETCH(c);
}
+
if (c != '>') {
r = ONIGERR_INVALID_GROUP_NAME;
name_end = end;
}
else {
- if (ONIGENC_IS_CODE_UPPER(env->enc, first_code))
+ if (ONIGENC_IS_CODE_ASCII(first_code) &&
+ ONIGENC_IS_CODE_UPPER(enc, first_code))
r = ONIGERR_INVALID_GROUP_NAME;
}
@@ -2216,22 +2293,21 @@ static int
fetch_name(UChar** src, UChar* end, UChar** rname_end, ScanEnv* env, int ref)
{
int r, len;
- int c = 0;
- OnigCodePoint code;
+ OnigCodePoint c = 0;
UChar *name_end;
+ OnigEncoding enc = env->enc;
UChar *p = *src;
+ PFETCH_READY;
r = 0;
while (!PEND) {
name_end = p;
- code = ONIGENC_MBC_TO_CODE(env->enc, p, end);
- len = enc_len(env->enc, c);
- PFETCH(c);
- if (len > 1)
+ if (enc_len(enc, p) > 1)
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
+ PFETCH(c);
if (c == '>' || c == ')') break;
- if (! ONIGENC_IS_CODE_DIGIT(env->enc, code))
+ if (! ONIGENC_IS_CODE_DIGIT(enc, c))
r = ONIGERR_INVALID_CHAR_IN_GROUP_NAME;
p += (len - 1);
@@ -2294,12 +2370,12 @@ find_str_position(OnigCodePoint s[], int n, UChar* from, UChar* to,
while (p < to) {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
- q = p + enc_len(enc, *p);
+ q = p + enc_len(enc, p);
if (x == s[0]) {
for (i = 1; i < n && q < to; i++) {
x = ONIGENC_MBC_TO_CODE(enc, q, to);
if (x != s[i]) break;
- q += enc_len(enc, *q);
+ q += enc_len(enc, q);
}
if (i >= n) {
if (IS_NOT_NULL(next))
@@ -2325,24 +2401,24 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
while (p < to) {
if (in_esc) {
in_esc = 0;
- p += enc_len(enc, *p);
+ p += enc_len(enc, p);
}
else {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
- q = p + enc_len(enc, *p);
+ q = p + enc_len(enc, p);
if (x == s[0]) {
for (i = 1; i < n && q < to; i++) {
x = ONIGENC_MBC_TO_CODE(enc, q, to);
if (x != s[i]) break;
- q += enc_len(enc, *q);
+ q += enc_len(enc, q);
}
if (i >= n) return 1;
- p += enc_len(enc, *p);
+ p += enc_len(enc, p);
}
else {
x = ONIGENC_MBC_TO_CODE(enc, p, to);
if (x == bad) return 0;
- else if (x == MC_ESC) in_esc = 1;
+ else if (x == MC_ESC(enc)) in_esc = 1;
p = q;
}
}
@@ -2353,10 +2429,13 @@ str_exist_check_with_esc(OnigCodePoint s[], int n, UChar* from, UChar* to,
static int
fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
- int c, num;
+ int num;
+ OnigCodePoint c, c2;
OnigSyntaxType* syn = env->syntax;
+ OnigEncoding enc = env->enc;
UChar* prev;
UChar* p = *src;
+ PFETCH_READY;
if (PEND) {
tok->type = TK_EOT;
@@ -2364,7 +2443,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
PFETCH(c);
- tok->type = TK_BYTE;
+ tok->type = TK_CHAR;
tok->base = 0;
tok->u.c = c;
if (c == ']') {
@@ -2373,7 +2452,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
else if (c == '-') {
tok->type = TK_CC_RANGE;
}
- else if (c == MC_ESC) {
+ else if (c == MC_ESC(enc)) {
if (! IS_SYNTAX_BV(syn, ONIG_SYN_BACKSLASH_ESCAPE_IN_CC))
goto end;
@@ -2407,17 +2486,27 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->type = TK_CHAR_TYPE;
tok->u.subtype = CTYPE_NOT_WHITE_SPACE;
break;
+ case 'h':
+ if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
+ tok->type = TK_CHAR_TYPE;
+ tok->u.subtype = CTYPE_XDIGIT;
+ break;
+ case 'H':
+ if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
+ tok->type = TK_CHAR_TYPE;
+ tok->u.subtype = CTYPE_NOT_XDIGIT;
+ break;
case 'p':
case 'P':
- if (PPEEK == '{' &&
+ c2 = PPEEK;
+ if (c2 == '{' &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
- int c2;
PFETCH(c2);
if (c2 == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
@@ -2432,14 +2521,17 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (PEND) break;
prev = p;
- if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
+ if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
- if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9)
- return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
+ if (!PEND) {
+ c2 = PPEEK;
+ if (ONIGENC_IS_CODE_XDIGIT(enc, c2))
+ return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
+ }
- if (p > prev + 1 && !PEND && PPEEK == '}') {
+ if (p > prev + enc_len(enc, prev) && !PEND && (PPEEK_IS('}'))) {
PINC;
tok->type = TK_CODE_POINT;
tok->base = 16;
@@ -2451,7 +2543,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -2467,12 +2559,12 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
- tok->type = TK_RAW_BYTE;
+ tok->type = TK_CODE_POINT;
tok->base = 16;
tok->u.c = num;
}
@@ -2483,7 +2575,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
PUNFETCH;
prev = p;
- num = scan_unsigned_octal_number(&p, end, 3, env->enc);
+ num = scan_unsigned_octal_number(&p, end, 3, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -2500,18 +2592,18 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (num < 0) return num;
if (tok->u.c != num) {
tok->u.c = num;
- tok->type = TK_RAW_BYTE;
+ tok->type = TK_CODE_POINT;
}
break;
}
}
else if (c == '[') {
- if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && PPEEK == ':') {
+ if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_POSIX_BRACKET) && (PPEEK_IS(':'))) {
OnigCodePoint send[] = { (OnigCodePoint )':', (OnigCodePoint )']' };
tok->backp = p; /* point at '[' is readed */
PINC;
- if (str_exist_check_with_esc(send, 2, p, end, (OnigCodePoint )']',
- env->enc)) {
+ if (str_exist_check_with_esc(send, 2, p, end,
+ (OnigCodePoint )']', enc)) {
tok->type = TK_POSIX_BRACKET_OPEN;
}
else {
@@ -2531,7 +2623,7 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
else if (c == '&') {
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_CCLASS_SET_OP) &&
- !PEND && PPEEK == '&') {
+ !PEND && (PPEEK_IS('&'))) {
PINC;
tok->type = TK_CC_AND;
}
@@ -2545,10 +2637,13 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
static int
fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
{
- int r, c, num;
+ int r, num;
+ OnigCodePoint c;
+ OnigEncoding enc = env->enc;
OnigSyntaxType* syn = env->syntax;
UChar* prev;
UChar* p = *src;
+ PFETCH_READY;
start:
if (PEND) {
@@ -2556,13 +2651,17 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
return tok->type;
}
- tok->type = TK_BYTE;
- tok->base = 0;
+ tok->type = TK_STRING;
+ tok->base = 0;
+ tok->backp = p;
+
PFETCH(c);
- if (c == MC_ESC) {
+ if (c == MC_ESC(enc)) {
if (PEND) return ONIGERR_END_PATTERN_AT_BACKSLASH;
+ tok->backp = p;
PFETCH(c);
+
tok->u.c = c;
tok->escaped = 1;
switch (c) {
@@ -2588,37 +2687,42 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->u.repeat.lower = 0;
tok->u.repeat.upper = 1;
greedy_check:
- if (!PEND && PPEEK == '?' &&
+ if (!PEND && PPEEK_IS('?') &&
IS_SYNTAX_OP(syn, ONIG_SYN_OP_QMARK_NON_GREEDY)) {
PFETCH(c);
tok->u.repeat.greedy = 0;
tok->u.repeat.possessive = 0;
}
- else if (!PEND && PPEEK == '+' &&
- ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
- tok->type != TK_INTERVAL) ||
- (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
- tok->type == TK_INTERVAL))) {
- PFETCH(c);
- tok->u.repeat.greedy = 1;
- tok->u.repeat.possessive = 1;
- }
else {
- tok->u.repeat.greedy = 1;
- tok->u.repeat.possessive = 0;
+ possessive_check:
+ if (!PEND && PPEEK_IS('+') &&
+ ((IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_REPEAT) &&
+ tok->type != TK_INTERVAL) ||
+ (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_PLUS_POSSESSIVE_INTERVAL) &&
+ tok->type == TK_INTERVAL))) {
+ PFETCH(c);
+ tok->u.repeat.greedy = 1;
+ tok->u.repeat.possessive = 1;
+ }
+ else {
+ tok->u.repeat.greedy = 1;
+ tok->u.repeat.possessive = 0;
+ }
}
break;
case '{':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_BRACE_INTERVAL)) break;
- tok->backp = p;
r = fetch_range_qualifier(&p, end, tok, env);
if (r < 0) return r; /* error */
- if (r > 0) {
- /* normal char */
- }
- else
+ if (r == 0) goto greedy_check;
+ else if (r == 2) { /* {n} */
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
+ goto possessive_check;
+
goto greedy_check;
+ }
+ /* r == 1 : normal char */
break;
case '|':
@@ -2698,6 +2802,18 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->u.subtype = CTYPE_NOT_DIGIT;
break;
+ case 'h':
+ if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
+ tok->type = TK_CHAR_TYPE;
+ tok->u.subtype = CTYPE_XDIGIT;
+ break;
+
+ case 'H':
+ if (! IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_H_XDIGIT)) break;
+ tok->type = TK_CHAR_TYPE;
+ tok->u.subtype = CTYPE_NOT_XDIGIT;
+ break;
+
case 'A':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_AZ_BUF_ANCHOR)) break;
begin_buf:
@@ -2738,14 +2854,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (PEND) break;
prev = p;
- if (PPEEK == '{' && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
+ if (PPEEK_IS('{') && IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_BRACE_HEX8)) {
PINC;
- num = scan_unsigned_hexadecimal_number(&p, end, 8, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 8, enc);
if (num < 0) return ONIGERR_TOO_BIG_WIDE_CHAR_VALUE;
- if (!PEND && ONIGENC_IS_CODE_XDIGIT(env->enc, *p) && p - prev >= 9)
- return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
+ if (!PEND) {
+ if (ONIGENC_IS_CODE_XDIGIT(enc, PPEEK))
+ return ONIGERR_TOO_LONG_WIDE_CHAR_VALUE;
+ }
- if (p > prev + 1 && !PEND && PPEEK == '}') {
+ if ((p > prev + enc_len(enc, prev)) && !PEND && PPEEK_IS('}')) {
PINC;
tok->type = TK_CODE_POINT;
tok->u.code = (OnigCodePoint )num;
@@ -2756,7 +2874,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
}
else if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_X_HEX2)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 2, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 2, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -2772,12 +2890,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
prev = p;
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_U_HEX4)) {
- num = scan_unsigned_hexadecimal_number(&p, end, 4, env->enc);
+ num = scan_unsigned_hexadecimal_number(&p, end, 4, enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
}
- tok->type = TK_RAW_BYTE;
+ tok->type = TK_CODE_POINT;
tok->base = 16;
tok->u.c = num;
}
@@ -2787,9 +2905,10 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '5': case '6': case '7': case '8': case '9':
PUNFETCH;
prev = p;
- num = onig_scan_unsigned_number(&p, end, env->enc);
- if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
- if (num > ONIG_MAX_BACKREF_NUM) return ONIGERR_TOO_BIG_BACKREF_NUMBER;
+ num = onig_scan_unsigned_number(&p, end, enc);
+ if (num < 0 || num > ONIG_MAX_BACKREF_NUM) {
+ goto skip_backref;
+ }
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_DECIMAL_BACKREF) &&
(num <= env->num_mem || num <= 9)) { /* This spec. from GNU regex */
@@ -2804,7 +2923,9 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
tok->u.backref.by_name = 0;
break;
}
- else if (c == '8' || c == '9') {
+
+ skip_backref:
+ if (c == '8' || c == '9') {
/* normal char */
p = prev; PINC;
break;
@@ -2815,7 +2936,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '0':
if (IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_OCTAL3)) {
prev = p;
- num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), env->enc);
+ num = scan_unsigned_octal_number(&p, end, (c == '0' ? 2:3), enc);
if (num < 0) return ONIGERR_TOO_BIG_NUMBER;
if (p == prev) { /* can't read nothing. */
num = 0; /* but, it's not error */
@@ -2901,16 +3022,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case 'p':
case 'P':
- if (PPEEK == '{' &&
+ if (PPEEK_IS('{') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CHAR_PROPERTY)) {
PINC;
tok->type = TK_CHAR_PROPERTY;
tok->u.prop.not = (c == 'P' ? 1 : 0);
if (IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_ESC_P_BRACE_CIRCUMFLEX_NOT)) {
- int c2;
- PFETCH(c2);
- if (c2 == '^') {
+ PFETCH(c);
+ if (c == '^') {
tok->u.prop.not = (tok->u.prop.not == 0 ? 1 : 0);
}
else
@@ -2925,9 +3045,12 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (num < 0) return num;
/* set_raw: */
if (tok->u.c != num) {
- tok->type = TK_RAW_BYTE;
+ tok->type = TK_CODE_POINT;
tok->u.c = num;
}
+ else { /* string */
+ p = tok->backp + enc_len(enc, tok->backp);
+ }
break;
}
}
@@ -2938,15 +3061,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
#ifdef USE_VARIABLE_META_CHARS
if ((c != ONIG_INEFFECTIVE_META_CHAR) &&
IS_SYNTAX_OP(syn, ONIG_SYN_OP_VARIABLE_META_CHARACTERS)) {
- if (c == MC_ANYCHAR)
+ if (c == MC_ANYCHAR(enc))
goto any_char;
- else if (c == MC_ANYTIME)
+ else if (c == MC_ANYTIME(enc))
goto anytime;
- else if (c == MC_ZERO_OR_ONE_TIME)
+ else if (c == MC_ZERO_OR_ONE_TIME(enc))
goto zero_or_one_time;
- else if (c == MC_ONE_OR_MORE_TIME)
+ else if (c == MC_ONE_OR_MORE_TIME(enc))
goto one_or_more_time;
- else if (c == MC_ANYCHAR_ANYTIME) {
+ else if (c == MC_ANYCHAR_ANYTIME(enc)) {
tok->type = TK_ANYCHAR_ANYTIME;
goto out;
}
@@ -2989,14 +3112,16 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
case '{':
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_BRACE_INTERVAL)) break;
- tok->backp = p;
r = fetch_range_qualifier(&p, end, tok, env);
if (r < 0) return r; /* error */
- if (r > 0) {
- /* normal char */
- }
- else
+ if (r == 0) goto greedy_check;
+ else if (r == 2) { /* {n} */
+ if (IS_SYNTAX_BV(syn, ONIG_SYN_FIXED_INTERVAL_IS_GREEDY_ONLY))
+ goto possessive_check;
+
goto greedy_check;
+ }
+ /* r == 1 : normal char */
break;
case '|':
@@ -3005,15 +3130,15 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
case '(':
- if (PPEEK == '?' &&
+ if (PPEEK_IS('?') &&
IS_SYNTAX_OP2(syn, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
- if (PPEEK == '#') {
+ if (PPEEK_IS('#')) {
PFETCH(c);
while (1) {
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
PFETCH(c);
- if (c == MC_ESC) {
+ if (c == MC_ESC(enc)) {
if (!PEND) PFETCH(c);
}
else {
@@ -3062,7 +3187,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
if (IS_EXTEND(env->option)) {
while (!PEND) {
PFETCH(c);
- if (ONIG_IS_NEWLINE(c))
+ if (ONIGENC_IS_CODE_NEWLINE(enc, c))
break;
}
goto start;
@@ -3076,6 +3201,7 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
break;
default:
+ /* string */
break;
}
}
@@ -3086,22 +3212,20 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
}
static int
-add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not,
- OnigEncoding enc)
+add_ctype_to_cc_by_range(CClassNode* cc, int ctype, int not, OnigEncoding enc,
+ int nsb, int nmb,
+ OnigCodePointRange *sbr, OnigCodePointRange *mbr)
{
- int i, r, nsb, nmb;
- OnigCodePointRange *sbr, *mbr;
+ int i, r;
OnigCodePoint j;
- r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr);
- if (r != 0) return r;
-
if (not == 0) {
for (i = 0; i < nsb; i++) {
for (j = sbr[i].from; j <= sbr[i].to; j++) {
- BITSET_SET_BIT(cc->bs, j);
+ BITSET_SET_BIT(cc->bs, j);
}
}
+
for (i = 0; i < nmb; i++) {
r = add_code_range_to_buf(&(cc->mbuf), mbr[i].from, mbr[i].to);
if (r != 0) return r;
@@ -3109,19 +3233,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not,
}
else {
OnigCodePoint prev = 0;
- for (i = 0; i < nsb; i++) {
- for (j = prev; j < sbr[i].from; j++) {
- BITSET_SET_BIT(cc->bs, j);
+
+ if (ONIGENC_MBC_MINLEN(enc) == 1) {
+ for (i = 0; i < nsb; i++) {
+ for (j = prev; j < sbr[i].from; j++) {
+ BITSET_SET_BIT(cc->bs, j);
+ }
+ prev = sbr[i].to + 1;
}
- prev = sbr[i].to + 1;
- }
- if (prev < 0x7f) {
- for (j = prev; j < 0x7f; j++) {
- BITSET_SET_BIT(cc->bs, j);
+ if (prev < 0x7f) {
+ for (j = prev; j < 0x7f; j++) {
+ BITSET_SET_BIT(cc->bs, j);
+ }
}
+
+ prev = 0x80;
}
- prev = 0x80;
for (i = 0; i < nmb; i++) {
if (prev < mbr[i].from) {
r = add_code_range_to_buf(&(cc->mbuf), prev, mbr[i].from - 1);
@@ -3135,17 +3263,23 @@ add_ctype_to_cc_by_list(CClassNode* cc, int ctype, int not,
}
}
- return r;
+ return 0;
}
static int
add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
{
int c, r;
+ int nsb, nmb;
+ OnigCodePointRange *sbr, *mbr;
OnigEncoding enc = env->enc;
- if (ONIGENC_CTYPE_SUPPORT_LEVEL(enc) != ONIGENC_CTYPE_SUPPORT_LEVEL_SB) {
- r = add_ctype_to_cc_by_list(cc, ctype, not, env->enc);
+ r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &nsb, &nmb, &sbr, &mbr);
+ if (r == 0) {
+ return add_ctype_to_cc_by_range(cc, ctype, not, env->enc,
+ nsb, nmb, sbr, mbr);
+ }
+ else if (r != ONIG_NO_SUPPORT_CONFIG) {
return r;
}
@@ -3203,7 +3337,8 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
}
else {
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
- if (! ONIGENC_IS_CODE_SB_WORD(enc, c) && ! ONIGENC_IS_MBC_HEAD(enc, c))
+ if ((ONIGENC_CODE_TO_MBCLEN(enc, c) > 0) &&
+ ! ONIGENC_IS_CODE_WORD(enc, c))
BITSET_SET_BIT(cc->bs, c);
}
}
@@ -3247,6 +3382,14 @@ parse_ctype_to_enc_ctype(int pctype, int* not)
ctype = ONIGENC_CTYPE_DIGIT;
*not = 1;
break;
+ case CTYPE_XDIGIT:
+ ctype = ONIGENC_CTYPE_XDIGIT;
+ *not = 0;
+ break;
+ case CTYPE_NOT_XDIGIT:
+ ctype = ONIGENC_CTYPE_XDIGIT;
+ *not = 1;
+ break;
default:
return ONIGERR_PARSER_BUG;
break;
@@ -3284,23 +3427,26 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
};
PosixBracketEntryType *pb;
- int not, i, c, r;
+ int not, i, r;
+ OnigCodePoint c;
+ OnigEncoding enc = env->enc;
UChar *p = *src;
+ PFETCH_READY;
- if (PPEEK == '^') {
+ if (PPEEK_IS('^')) {
PINC;
not = 1;
}
else
not = 0;
- if (end - p < POSIX_BRACKET_NAME_MAX_LEN + 1)
+ if (onigenc_strlen(enc, p, end) < POSIX_BRACKET_NAME_MAX_LEN + 2)
goto not_posix_bracket;
for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
- if (onig_strncmp(p, pb->name, pb->len) == 0) {
- p += pb->len;
- if (end - p < 2 || *p != ':' || *(p+1) != ']')
+ if (onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0) {
+ p = onigenc_step(enc, p, end, pb->len);
+ if (onigenc_with_ascii_strncmp(enc, p, end, ":]", 2) != 0)
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
r = add_ctype_to_cc(cc, pb->ctype, not, env);
@@ -3319,9 +3465,9 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
PINC;
if (++i > POSIX_BRACKET_CHECK_LIMIT_LENGTH) break;
}
- if (c == ':' && !PEND) {
+ if (c == ':' && ! PEND) {
PINC;
- if (!PEND) {
+ if (! PEND) {
PFETCH(c);
if (c == ']')
return ONIGERR_INVALID_POSIX_BRACKET_TYPE;
@@ -3332,7 +3478,7 @@ parse_posix_bracket(CClassNode* cc, UChar** src, UChar* end, ScanEnv* env)
}
static int
-property_name_to_ctype(UChar* p, UChar* end)
+property_name_to_ctype(UChar* p, UChar* end, OnigEncoding enc)
{
static PosixBracketEntryType PBS[] = {
{ "Alnum", ONIGENC_CTYPE_ALNUM, 5 },
@@ -3354,9 +3500,10 @@ property_name_to_ctype(UChar* p, UChar* end)
PosixBracketEntryType *pb;
int len;
- len = end - p;
+ len = onigenc_strlen(enc, p, end);
for (pb = PBS; IS_NOT_NULL(pb->name); pb++) {
- if (len == pb->len && onig_strncmp(p, pb->name, pb->len) == 0)
+ if (len == pb->len &&
+ onigenc_with_ascii_strncmp(enc, p, end, pb->name, pb->len) == 0)
return pb->ctype;
}
@@ -3367,8 +3514,10 @@ static int
fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
{
int ctype;
+ OnigCodePoint c;
+ OnigEncoding enc = env->enc;
UChar *prev, *start, *p = *src;
- int c;
+ PFETCH_READY;
/* 'IsXXXX' => 'XXXX' */
if (!PEND &&
@@ -3392,7 +3541,7 @@ fetch_char_property_to_ctype(UChar** src, UChar* end, ScanEnv* env)
prev = p;
PFETCH(c);
if (c == '}') {
- ctype = property_name_to_ctype(start, prev);
+ ctype = property_name_to_ctype(start, prev, enc);
if (ctype < 0) break;
*src = p;
@@ -3499,12 +3648,26 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
}
}
else {
+#if 0
if (intype == CCV_CODE_POINT && *type == CCV_SB &&
ONIGENC_IS_CONTINUOUS_SB_MB(env->enc)) {
bitset_set_range(cc->bs, (int )*vs, 0x7f);
r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )0x80, v);
if (r < 0) return r;
}
+#else
+ if (intype == CCV_CODE_POINT && *type == CCV_SB) {
+ if (*vs > v) {
+ if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_EMPTY_RANGE_IN_CC))
+ goto ccs_range_end;
+ else
+ return ONIGERR_EMPTY_RANGE_IN_CHAR_CLASS;
+ }
+ bitset_set_range(cc->bs, (int )*vs, (int )(v < 0xff ? v : 0xff));
+ r = add_code_range(&(cc->mbuf), env, (OnigCodePoint )*vs, v);
+ if (r < 0) return r;
+ }
+#endif
else
return ONIGERR_MISMATCH_CODE_LENGTH_IN_CLASS_RANGE;
}
@@ -3528,22 +3691,24 @@ next_state_val(CClassNode* cc, OnigCodePoint *vs, OnigCodePoint v,
}
static int
-char_exist_check(UChar c, UChar* from, UChar* to, int ignore_escaped,
+code_exist_check(OnigCodePoint c, UChar* from, UChar* end, int ignore_escaped,
OnigEncoding enc)
{
int in_esc;
+ OnigCodePoint code;
UChar* p = from;
+ PFETCH_READY;
in_esc = 0;
- while (p < to) {
+ while (! PEND) {
if (ignore_escaped && in_esc) {
in_esc = 0;
}
else {
- if (*p == c) return 1;
- if (*p == MC_ESC) in_esc = 1;
+ PFETCH(code);
+ if (code == c) return 1;
+ if (code == MC_ESC(enc)) in_esc = 1;
}
- p += enc_len(enc, *p);
}
return 0;
}
@@ -3566,7 +3731,7 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
prev_cc = (CClassNode* )NULL;
*np = NULL_NODE;
r = fetch_token_in_cc(tok, src, end, env);
- if (r == TK_BYTE && tok->u.c == '^' && tok->escaped == 0) {
+ if (r == TK_CHAR && tok->u.c == '^' && tok->escaped == 0) {
neg = 1;
r = fetch_token_in_cc(tok, src, end, env);
}
@@ -3576,11 +3741,12 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
if (r < 0) return r;
if (r == TK_CC_CLOSE) {
- if (! char_exist_check(']', *src, env->pattern_end, 1, env->enc))
+ if (! code_exist_check((OnigCodePoint )']',
+ *src, env->pattern_end, 1, env->enc))
return ONIGERR_EMPTY_CHAR_CLASS;
CC_ESC_WARN(env, "]");
- r = tok->type = TK_BYTE; /* allow []...] */
+ r = tok->type = TK_CHAR; /* allow []...] */
}
*np = node = node_new_cclass();
@@ -3593,58 +3759,69 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
while (r != TK_CC_CLOSE) {
fetched = 0;
switch (r) {
- case TK_BYTE:
- len = enc_len(env->enc, tok->u.c);
+ case TK_CHAR:
+ len = ONIGENC_CODE_TO_MBCLEN(env->enc, tok->u.c);
if (len > 1) {
- PUNFETCH;
- v = ONIGENC_MBC_TO_CODE(env->enc, p, end);
- p += len;
in_type = CCV_CODE_POINT;
}
else {
sb_char:
- v = (OnigCodePoint )tok->u.c;
in_type = CCV_SB;
}
+ v = (OnigCodePoint )tok->u.c;
in_israw = 0;
goto val_entry2;
break;
case TK_RAW_BYTE:
- len = enc_len(env->enc, tok->u.c);
- if (len > 1 && tok->base != 0) { /* tok->base != 0 : octal or hexadec. */
+ /* tok->base != 0 : octal or hexadec. */
+ if (! ONIGENC_IS_SINGLEBYTE(env->enc) && tok->base != 0) {
UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
- UChar* bufp = buf;
UChar* bufe = buf + ONIGENC_CODE_TO_MBC_MAXLEN;
+ UChar* psave = p;
int i, base = tok->base;
- if (len > ONIGENC_CODE_TO_MBC_MAXLEN) {
- bufp = (UChar* )xmalloc(len);
- if (IS_NULL(bufp)) {
- r = ONIGERR_MEMORY;
- goto err;
+ buf[0] = tok->u.c;
+ for (i = 1; i < ONIGENC_MBC_MAXLEN(env->enc); i++) {
+ r = fetch_token_in_cc(tok, &p, end, env);
+ if (r < 0) goto err;
+ if (r != TK_RAW_BYTE || tok->base != base) {
+ fetched = 1;
+ break;
}
- bufe = bufp + len;
+ buf[i] = tok->u.c;
}
- bufp[0] = tok->u.c;
- for (i = 1; i < len; i++) {
- r = fetch_token_in_cc(tok, &p, end, env);
- if (r < 0) goto raw_byte_err;
- if (r != TK_RAW_BYTE || tok->base != base) break;
- bufp[i] = tok->u.c;
+
+ if (i < ONIGENC_MBC_MINLEN(env->enc)) {
+ r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
+ goto err;
}
+
+ len = enc_len(env->enc, buf);
if (i < len) {
r = ONIGERR_TOO_SHORT_MULTI_BYTE_STRING;
- raw_byte_err:
- if (bufp != buf) xfree(bufp);
goto err;
}
- v = ONIGENC_MBC_TO_CODE(env->enc, bufp, bufe);
- if (bufp != buf) xfree(bufp);
- in_type = CCV_CODE_POINT;
+ else if (i > len) { /* fetch back */
+ p = psave;
+ for (i = 1; i < len; i++) {
+ r = fetch_token_in_cc(tok, &p, end, env);
+ }
+ fetched = 0;
+ }
+
+ if (i == 1) {
+ v = (OnigCodePoint )buf[0];
+ goto raw_single;
+ }
+ else {
+ v = ONIGENC_MBC_TO_CODE(env->enc, buf, bufe);
+ in_type = CCV_CODE_POINT;
+ }
}
else {
v = (OnigCodePoint )tok->u.c;
+ raw_single:
in_type = CCV_SB;
}
in_israw = 1;
@@ -3838,8 +4015,17 @@ parse_char_class(Node** np, OnigToken* tok, UChar** src, UChar* end,
is_empty = (IS_NULL(cc->mbuf) ? 1 : 0);
if (is_empty != 0)
BITSET_IS_EMPTY(cc->bs, is_empty);
- if (is_empty == 0)
- BITSET_SET_BIT(cc->bs, ONIG_NEWLINE);
+
+ if (is_empty == 0) {
+#define NEWLINE_CODE 0x0a
+
+ if (ONIGENC_IS_CODE_NEWLINE(env->enc, NEWLINE_CODE)) {
+ if (ONIGENC_CODE_TO_MBCLEN(env->enc, NEWLINE_CODE) == 1)
+ BITSET_SET_BIT(cc->bs, NEWLINE_CODE);
+ else
+ add_code_range(&(cc->mbuf), env, NEWLINE_CODE, NEWLINE_CODE);
+ }
+ }
}
*src = p;
return 0;
@@ -3858,17 +4044,20 @@ static int
parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
ScanEnv* env)
{
+ int r, num;
+ int list_capture;
Node *target;
OnigOptionType option;
- int r, c, num;
- int list_capture;
+ OnigEncoding enc = env->enc;
+ OnigCodePoint c;
UChar* p = *src;
+ PFETCH_READY;
*np = NULL;
if (PEND) return ONIGERR_END_PATTERN_WITH_UNMATCHED_PARENTHESIS;
option = env->option;
- if (PPEEK == '?' &&
+ if (PPEEK_IS('?') &&
IS_SYNTAX_OP2(env->syntax, ONIG_SYN_OP2_QMARK_GROUP_EFFECT)) {
PINC;
if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
@@ -4016,7 +4205,7 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
else if (c == ':') {
OnigOptionType prev = env->option;
- env->option = option;
+ env->option = option;
r = fetch_token(tok, &p, end, env);
if (r < 0) return r;
r = parse_subexp(&target, tok, term, &p, end, env);
@@ -4072,7 +4261,6 @@ parse_effect(Node** np, OnigToken* tok, int term, UChar** src, UChar* end,
return 0;
}
-
static char* PopularQStr[] = {
"?", "*", "+", "??", "*?", "+?"
};
@@ -4137,7 +4325,7 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env)
if (onig_verb_warn != onig_null_warn) {
onig_snprintf_with_pattern(buf, WARN_BUFSIZE, env->enc,
env->pattern, env->pattern_end,
- "nested repeat operator '%s and %s' was replaced with '%s'",
+ "nested repeat operator %s and %s was replaced with '%s'",
PopularQStr[targetq_num], PopularQStr[nestq_num],
ReduceQStr[ReduceTypeTable[targetq_num][nestq_num]]);
(*onig_verb_warn)(buf);
@@ -4165,74 +4353,59 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env)
return 0;
}
-#ifdef USE_FOLD_MATCH
static int
-make_alt_node_from_fold_info(OnigEncFoldMatchInfo* info, Node** node)
+make_compound_alt_node_from_cc(OnigAmbigType ambig_flag, OnigEncoding enc,
+ CClassNode* cc, Node** root)
{
- int i;
- UChar *s, *end;
- Node *root, **ptail, *snode;
-
- ptail = &root;
- for (i = 0; i < info->target_num; i++) {
- s = info->target_str[i];
- end = s + info->target_byte_len[i];
- /* ex.
- U+00DF match "ss" and "SS, but not match "Ss".
- So, string nodes must be raw.
- */
- snode = node_new_str_raw(s, end);
- CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
-
- *ptail = node_new_alt(snode, NULL_NODE);
- CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY);
- ptail = &(NCONS(*ptail).right);
- }
- *ptail = NULL_NODE;
- *node = root;
- return 0;
-}
-
-static int
-make_fold_alt_node_from_cc(OnigEncoding enc, CClassNode* cc, Node** root)
-{
- int i, j, flen, len, ncode, n;
- UChar *s, *end, buf[ONIGENC_CODE_TO_MBC_MAXLEN];
- OnigCodePoint* codes;
- Node **ptail, *snode;
- OnigEncFoldMatchInfo* info;
+ int r, i, j, k, clen, len, ncode, n;
+ UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN];
+ Node **ptail, *snode = NULL_NODE;
+ OnigCompAmbigCodes* ccs;
+ OnigCompAmbigCodeItem* ci;
+ OnigAmbigType amb;
+ n = 0;
*root = NULL_NODE;
ptail = root;
- ncode = ONIGENC_GET_ALL_FOLD_MATCH_CODE(enc, &codes);
- n = 0;
- for (i = 0; i < ncode; i++) {
- if (onig_is_code_in_cc(enc, codes[i], cc)) {
- len = ONIGENC_CODE_TO_MBC(enc, codes[i], buf);
- flen = ONIGENC_GET_FOLD_MATCH_INFO(enc, buf, buf + len, &info);
- if (flen > 0) { /* fold */
- for (j = 0; j < info->target_num; j++) {
- s = info->target_str[j];
- end = s + info->target_byte_len[j];
- if (onig_strncmp(s, buf, enc_len(enc, *s)) == 0)
- continue; /* ignore single char. */
-
- snode = node_new_str_raw(s, end);
- CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
-
- *ptail = node_new_alt(snode, NULL_NODE);
- CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY);
- ptail = &(NCONS(*ptail).right);
- n++;
- }
+
+ for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
+ if ((amb & ambig_flag) == 0) continue;
+
+ ncode = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs);
+ for (i = 0; i < ncode; i++) {
+ if (onig_is_code_in_cc(enc, ccs[i].code, cc)) {
+ for (j = 0; j < ccs[i].n; j++) {
+ ci = &(ccs[i].items[j]);
+ if (ci->len > 1) { /* compound only */
+ if (cc->not) clear_not_flag_cclass(cc, enc);
+
+ clen = ci->len;
+ for (k = 0; k < clen; k++) {
+ len = ONIGENC_CODE_TO_MBC(enc, ci->code[k], buf);
+
+ if (k == 0) {
+ snode = node_new_str_raw(buf, buf + len);
+ CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
+ }
+ else {
+ r = onig_node_str_cat(snode, buf, buf + len);
+ if (r < 0) return r;
+ }
+ }
+
+ *ptail = node_new_alt(snode, NULL_NODE);
+ CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY);
+ ptail = &(NCONS(*ptail).right);
+ n++;
+ }
+ }
}
}
}
return n;
}
-#endif
static int
parse_exp(Node** np, OnigToken* tok, int term,
@@ -4281,76 +4454,22 @@ parse_exp(Node** np, OnigToken* tok, int term,
else goto tk_byte;
break;
- case TK_BYTE:
+ case TK_STRING:
tk_byte:
{
- *np = node_new_str_char((UChar )tok->u.c);
+ *np = node_new_str(tok->backp, *src);
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
while (1) {
- len = enc_len(env->enc, tok->u.c);
- if (len > 1) {
- r = onig_node_str_cat(*np, *src, *src + len - 1);
- if (r < 0) return r;
- *src += (len - 1);
- }
-
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
- if (r != TK_BYTE) break;
+ if (r != TK_STRING) break;
- r = node_str_cat_char(*np, (UChar )tok->u.c);
+ r = onig_node_str_cat(*np, tok->backp, *src);
if (r < 0) return r;
}
- fold_entry:
-#ifdef USE_FOLD_MATCH
- if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) {
- int flen, ret;
- Node *root, **ptail, *work, *snode, *anode;
- UChar *p, *pprev;
- OnigEncFoldMatchInfo* fold_info;
- StrNode* sn = &(NSTRING(*np));
-
- ptail = &root;
- pprev = sn->s;
- for (p = sn->s; p < sn->end; ) {
- flen = ONIGENC_GET_FOLD_MATCH_INFO(env->enc, p, sn->end, &fold_info);
- if (flen > 0) { /* fold */
- ret = make_alt_node_from_fold_info(fold_info, &anode);
- if (ret != 0) return ret;
- work = node_new_list(anode, NULL);
- CHECK_NULL_RETURN_VAL(work, ONIGERR_MEMORY);
-
- if (pprev < p) {
- snode = node_new_str(pprev, p);
- CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
- *ptail = node_new_list(snode, work);
- CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY);
- }
- else {
- *ptail = work;
- }
- ptail = &(NCONS(work).right);
- p += flen;
- pprev = p;
- }
- else
- p += enc_len(env->enc, *p);
- }
- *ptail = NULL_NODE;
- if (IS_NOT_NULL(root)) {
- if (pprev < sn->end) {
- snode = node_new_str(pprev, sn->end);
- CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY);
- *ptail = node_new_list(snode, NULL_NODE);
- CHECK_NULL_RETURN_VAL(*ptail, ONIGERR_MEMORY);
- }
- onig_node_free(*np);
- *np = root;
- }
- }
-#endif
+ string_end:
targetp = np;
goto repeat;
}
@@ -4359,22 +4478,19 @@ parse_exp(Node** np, OnigToken* tok, int term,
case TK_RAW_BYTE:
tk_raw_byte:
{
- int expect_len;
-
*np = node_new_str_raw_char((UChar )tok->u.c);
CHECK_NULL_RETURN_VAL(*np, ONIGERR_MEMORY);
- expect_len = enc_len(env->enc, tok->u.c);
len = 1;
while (1) {
r = fetch_token(tok, src, end, env);
if (r < 0) return r;
if (r != TK_RAW_BYTE) {
#ifndef NUMBERED_CHAR_IS_NOT_CASE_AMBIG
- if (len >= expect_len) {
+ if (len >= enc_len(env->enc, NSTRING(*np).s)) {
NSTRING_CLEAR_RAW(*np);
}
#endif
- goto fold_entry;
+ goto string_end;
}
r = node_str_cat_char(*np, (UChar )tok->u.c);
@@ -4403,7 +4519,7 @@ parse_exp(Node** np, OnigToken* tok, int term,
OnigCodePoint end_op[2];
UChar *qstart, *qend, *nextp;
- end_op[0] = (OnigCodePoint )MC_ESC;
+ end_op[0] = (OnigCodePoint )MC_ESC(env->enc);
end_op[1] = (OnigCodePoint )'E';
qstart = *src;
qend = find_str_position(end_op, 2, qstart, end, &nextp, env->enc);
@@ -4429,6 +4545,8 @@ parse_exp(Node** np, OnigToken* tok, int term,
case CTYPE_NOT_WHITE_SPACE:
case CTYPE_DIGIT:
case CTYPE_NOT_DIGIT:
+ case CTYPE_XDIGIT:
+ case CTYPE_NOT_XDIGIT:
{
CClassNode* cc;
int ctype, not;
@@ -4456,27 +4574,65 @@ parse_exp(Node** np, OnigToken* tok, int term,
break;
case TK_CC_OPEN:
- r = parse_char_class(np, tok, src, end, env);
- if (r != 0) return r;
+ {
+ CClassNode* cc;
-#ifdef USE_FOLD_MATCH
- if (IS_IGNORECASE(env->option) && ONIGENC_IS_FOLD_MATCH(env->enc)) {
- int res;
- Node *alt_root, *work;
- CClassNode* cc = &(NCCLASS(*np));
-
- res = make_fold_alt_node_from_cc(env->enc, cc, &alt_root);
- if (res < 0) return res;
- if (res > 0) {
- work = node_new_alt(*np, alt_root);
- if (IS_NULL(work)) {
- onig_node_free(alt_root);
- return ONIGERR_MEMORY;
- }
- *np = work;
+ r = parse_char_class(np, tok, src, end, env);
+ if (r != 0) return r;
+
+ cc = &(NCCLASS(*np));
+
+ if (IS_IGNORECASE(env->option)) {
+ int i, n, in_cc;
+ OnigPairAmbigCodes* ccs;
+ BitSetRef bs = cc->bs;
+ OnigAmbigType amb;
+
+ for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) {
+ if ((amb & env->ambig_flag) == 0) continue;
+
+ n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(env->enc, amb, &ccs);
+ for (i = 0; i < n; i++) {
+ in_cc = onig_is_code_in_cc(env->enc, ccs[i].from, cc);
+
+ if ((in_cc != 0 && cc->not == 0) || (in_cc == 0 && cc->not != 0)) {
+ if (ONIGENC_MBC_MINLEN(env->enc) > 1 ||
+ ccs[i].from >= SINGLE_BYTE_SIZE) {
+ /* if (cc->not) clear_not_flag_cclass(cc, env->enc); */
+ add_code_range(&(cc->mbuf), env, ccs[i].to, ccs[i].to);
+ }
+ else {
+ if (BITSET_AT(bs, ccs[i].from)) {
+ /* /(?i:[^A-C])/.match("a") ==> fail. */
+ BITSET_SET_BIT(bs, ccs[i].to);
+ }
+ if (BITSET_AT(bs, ccs[i].to)) {
+ BITSET_SET_BIT(bs, ccs[i].from);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ if (IS_IGNORECASE(env->option) &&
+ (env->ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) {
+ int res;
+ Node *alt_root, *work;
+
+ res = make_compound_alt_node_from_cc(env->ambig_flag, env->enc,
+ cc, &alt_root);
+ if (res < 0) return res;
+ if (res > 0) {
+ work = node_new_alt(*np, alt_root);
+ if (IS_NULL(work)) {
+ onig_node_free(alt_root);
+ return ONIGERR_MEMORY;
+ }
+ *np = work;
+ }
}
}
-#endif
break;
case TK_ANYCHAR:
@@ -4522,7 +4678,6 @@ parse_exp(Node** np, OnigToken* tok, int term,
*np = node_new_empty();
}
else {
- *src = tok->backp;
goto tk_byte;
}
break;
@@ -4685,6 +4840,7 @@ onig_parse_make_tree(Node** root, UChar* pattern, UChar* end, regex_t* reg,
scan_env_clear(env);
env->option = reg->options;
+ env->ambig_flag = reg->ambig_flag;
env->enc = reg->enc;
env->syntax = reg->syntax;
env->pattern = pattern;
diff --git a/regparse.h b/regparse.h
index a4acd92208..5982ec8081 100644
--- a/regparse.h
+++ b/regparse.h
@@ -1,12 +1,33 @@
+#ifndef REGPARSE_H
+#define REGPARSE_H
/**********************************************************************
-
regparse.h - Oniguruma (regular expression library)
-
- Copyright (C) 2003-2004 K.Kosako (kosako@sofnec.co.jp)
-
**********************************************************************/
-#ifndef REGPARSE_H
-#define REGPARSE_H
+/*-
+ * Copyright (c) 2002-2004 K.Kosako <kosako AT sofnec DOT co DOT jp>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
#include "regint.h"
@@ -43,7 +64,8 @@
#define CTYPE_NOT_WHITE_SPACE (1<<3)
#define CTYPE_DIGIT (1<<4)
#define CTYPE_NOT_DIGIT (1<<5)
-
+#define CTYPE_XDIGIT (1<<6)
+#define CTYPE_NOT_XDIGIT (1<<7)
#define ANCHOR_ANYCHAR_STAR_MASK (ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL)
#define ANCHOR_END_BUF_MASK (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)
@@ -52,23 +74,23 @@
#define EFFECT_OPTION (1<<1)
#define EFFECT_STOP_BACKTRACK (1<<2)
-#define REPEAT_INFINITE -1
-#define IS_REPEAT_INFINITE(n) ((n) == REPEAT_INFINITE)
-
#define NODE_STR_MARGIN 16
#define NODE_STR_BUF_SIZE 24 /* sizeof(CClassNode) - sizeof(int)*4 */
#define NODE_BACKREFS_SIZE 7
#define NSTR_RAW (1<<0) /* by backslashed number */
-#define NSTR_CASE_AMBIG (1<<1)
-
-#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s)
-#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW
-#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW
-#define NSTRING_SET_CASE_AMBIG(node) (node)->u.str.flag |= NSTR_CASE_AMBIG
-#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0)
-#define NSTRING_IS_CASE_AMBIG(node) \
- (((node)->u.str.flag & NSTR_CASE_AMBIG) != 0)
+#define NSTR_AMBIG (1<<1)
+#define NSTR_AMBIG_REDUCE (1<<2)
+
+#define NSTRING_LEN(node) ((node)->u.str.end - (node)->u.str.s)
+#define NSTRING_SET_RAW(node) (node)->u.str.flag |= NSTR_RAW
+#define NSTRING_CLEAR_RAW(node) (node)->u.str.flag &= ~NSTR_RAW
+#define NSTRING_SET_AMBIG(node) (node)->u.str.flag |= NSTR_AMBIG
+#define NSTRING_SET_AMBIG_REDUCE(node) (node)->u.str.flag |= NSTR_AMBIG_REDUCE
+#define NSTRING_IS_RAW(node) (((node)->u.str.flag & NSTR_RAW) != 0)
+#define NSTRING_IS_AMBIG(node) (((node)->u.str.flag & NSTR_AMBIG) != 0)
+#define NSTRING_IS_AMBIG_REDUCE(node) \
+ (((node)->u.str.flag & NSTR_AMBIG_REDUCE) != 0)
#define BACKREFS_P(br) \
(IS_NOT_NULL((br)->back_dynamic) ? (br)->back_dynamic : (br)->back_static);
@@ -109,20 +131,19 @@ typedef struct {
} QualifierNode;
/* status bits */
-#define NST_MIN_FIXED (1<<0)
-#define NST_MAX_FIXED (1<<1)
-#define NST_CLEN_FIXED (1<<2)
-#define NST_MARK1 (1<<3)
-#define NST_MARK2 (1<<4)
-#define NST_MEM_BACKREFED (1<<5)
-#define NST_SIMPLE_REPEAT (1<<6) /* for stop backtrack optimization */
-
-#define NST_RECURSION (1<<7)
-#define NST_CALLED (1<<8)
-#define NST_ADDR_FIXED (1<<9)
-#define NST_NAMED_GROUP (1<<10)
-#define NST_NAME_REF (1<<11)
-#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in match stack. */
+#define NST_MIN_FIXED (1<<0)
+#define NST_MAX_FIXED (1<<1)
+#define NST_CLEN_FIXED (1<<2)
+#define NST_MARK1 (1<<3)
+#define NST_MARK2 (1<<4)
+#define NST_MEM_BACKREFED (1<<5)
+#define NST_STOP_BT_SIMPLE_REPEAT (1<<6)
+#define NST_RECURSION (1<<7)
+#define NST_CALLED (1<<8)
+#define NST_ADDR_FIXED (1<<9)
+#define NST_NAMED_GROUP (1<<10)
+#define NST_NAME_REF (1<<11)
+#define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. */
#define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f)
#define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f)
@@ -135,7 +156,8 @@ typedef struct {
#define IS_EFFECT_MIN_FIXED(en) (((en)->state & NST_MIN_FIXED) != 0)
#define IS_EFFECT_MAX_FIXED(en) (((en)->state & NST_MAX_FIXED) != 0)
#define IS_EFFECT_CLEN_FIXED(en) (((en)->state & NST_CLEN_FIXED) != 0)
-#define IS_EFFECT_SIMPLE_REPEAT(en) (((en)->state & NST_SIMPLE_REPEAT) != 0)
+#define IS_EFFECT_STOP_BT_SIMPLE_REPEAT(en) \
+ (((en)->state & NST_STOP_BT_SIMPLE_REPEAT) != 0)
#define IS_EFFECT_NAMED_GROUP(en) (((en)->state & NST_NAMED_GROUP) != 0)
#define SET_CALL_RECURSION(node) (node)->u.call.state |= NST_RECURSION
@@ -227,9 +249,10 @@ typedef struct _Node {
(senv)->mem_nodes_dynamic : (senv)->mem_nodes_static)
typedef struct {
- OnigOptionType option;
- OnigEncoding enc;
- OnigSyntaxType* syntax;
+ OnigOptionType option;
+ OnigAmbigType ambig_flag;
+ OnigEncoding enc;
+ OnigSyntaxType* syntax;
BitStatusType capture_history;
BitStatusType bt_mem_start;
BitStatusType bt_mem_end;
@@ -267,6 +290,9 @@ extern int onig_node_str_cat P_((Node* node, UChar* s, UChar* end));
extern void onig_node_free P_((Node* node));
extern Node* onig_node_new_effect P_((int type));
extern Node* onig_node_new_anchor P_((int type));
+extern Node* onig_node_new_str P_((UChar* s, UChar* end));
+extern Node* onig_node_new_list P_((Node* left, Node* right));
+extern void onig_node_str_clear P_((Node* node));
extern int onig_free_node_list();
extern int onig_names_free P_((regex_t* reg));
extern int onig_parse_make_tree P_((Node** root, UChar* pattern, UChar* end, regex_t* reg, ScanEnv* env));