From 82cb9eaa3bb49a77df4452cfdff18f817ecf63a6 Mon Sep 17 00:00:00 2001 From: ksaito Date: Thu, 4 Nov 2004 14:43:08 +0000 Subject: * ascii.c, euc_jp.c, oniggnu.h, oniguruma.h, regcomp.c, regenc.c, regenc.h, regerror.c, regexec.c, reggnu.c, regint.h, regparse.c, regparse.h, sjis.c, utf8.c: imported Oni Guruma 3.4.0. * parse.y, re.c: Now mbclen() takes unsigned char as its argument. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@7206 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- regcomp.c | 816 +++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 434 insertions(+), 382 deletions(-) (limited to 'regcomp.c') diff --git a/regcomp.c b/regcomp.c index 3b62e18b43..3db7b3ad6a 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1,16 +1,42 @@ /********************************************************************** - regcomp.c - Oniguruma (regular expression library) - - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) - **********************************************************************/ +/*- + * Copyright (c) 2002-2004 K.Kosako + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + #include "regparse.h" #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif +/* + Caution: node should not be a string node. + (s and end member address break) +*/ static void swap_node(Node* a, Node* b) { @@ -120,33 +146,6 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) #endif /* USE_SUBEXP_CALL */ -#if 0 -static int -bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) -{ - int i; - int len, maxlen = 0; - - if (negative) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - return maxlen; -} -#endif - static int add_opcode(regex_t* reg, int opcode) { @@ -293,15 +292,15 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) { int op; - switch (mb_len) { - case 1: - if (ignore_case) { - switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; - } + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; } - else { + } + else { + switch (mb_len) { + case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; @@ -310,25 +309,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_EXACTMB3N; + break; - default: - op = OP_EXACTMBN; - break; + default: + op = OP_EXACTMBN; + break; + } } return op; } @@ -373,7 +372,7 @@ compile_call(CallNode* node, regex_t* reg) r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), - node->target); + node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; @@ -394,15 +393,14 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) static int add_compile_string_length(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int len; int op = select_str_opcode(mb_len, str_len, ignore_case); len = SIZE_OPCODE; - if (op == OP_EXACTMBN) - len += SIZE_LENGTH; + if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; @@ -412,7 +410,7 @@ add_compile_string_length(UChar* s, int mb_len, int str_len, static int add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) + regex_t* reg, int ignore_case) { int op = select_str_opcode(mb_len, str_len, ignore_case); add_opcode(reg, op); @@ -420,8 +418,12 @@ add_compile_string(UChar* s, int mb_len, int str_len, if (op == OP_EXACTMBN) add_length(reg, mb_len); - if (IS_NEED_STR_LEN_OP_EXACT(op)) - add_length(reg, str_len); + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + if (op == OP_EXACTN_IC) + add_length(reg, mb_len * str_len); + else + add_length(reg, str_len); + } add_bytes(reg, s, mb_len * str_len); return 0; @@ -429,49 +431,37 @@ add_compile_string(UChar* s, int mb_len, int str_len, static int -compile_length_string_node(StrNode* sn, regex_t* reg) +compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig, ic; + int rlen, r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; rlen = 0; for (; p < sn->end; ) { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && ambig == 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); } else { r = add_compile_string_length(prev, prev_len, slen, reg, ambig); rlen += r; - - if (ic != 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - prev = p; slen = 1; prev_len = len; } - p += len; } r = add_compile_string_length(prev, prev_len, slen, reg, ambig); @@ -489,49 +479,33 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) } static int -compile_string_node(StrNode* sn, regex_t* reg) +compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig, ic; + int r, len, prev_len, slen, ambig; OnigEncoding enc = reg->enc; - UChar *p, *prev; + UChar *p, *prev, *end; + StrNode* sn; + sn = &(NSTRING(node)); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + end = sn->end; + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) - ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; - + prev_len = enc_len(enc, p); p += prev_len; slen = 1; - for (; p < sn->end; ) { - len = enc_len(enc, *p); + for (; p < end; ) { + len = enc_len(enc, p); if (len == prev_len) { slen++; - if (ic != 0 && len == 1) { - if (ambig == 0) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } } else { r = add_compile_string(prev, prev_len, slen, reg, ambig); if (r) return r; - if (ic != 0 && len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; prev = p; slen = 1; @@ -584,8 +558,7 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) len = SIZE_OPCODE + SIZE_BITSET; } else { - if (bitset_is_empty(cc->bs)) { - /* SIZE_BITSET is included in mbuf->used. */ + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { @@ -613,7 +586,7 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) r = add_bitset(reg, cc->bs); } else { - if (bitset_is_empty(cc->bs)) { + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT); else add_opcode(reg, OP_CCLASS_MB); @@ -649,7 +622,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, - sizeof(OnigRepeatRange) * n); + sizeof(OnigRepeatRange) * n); CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); reg->repeat_range = p; reg->repeat_range_alloc = n; @@ -665,7 +638,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) static int compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, - regex_t* reg) + regex_t* reg) { int r; int num_repeat = reg->num_repeat; @@ -685,15 +658,16 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, if (r) return r; if ( - #ifdef USE_SUBEXP_CALL +#ifdef USE_SUBEXP_CALL reg->num_call > 0 || - #endif +#endif IS_QUALIFIER_IN_REPEAT(qn)) { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); } else { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); } + if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; @@ -715,9 +689,9 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) if (NTYPE(qn->target) == N_ANYCHAR) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -750,7 +724,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } @@ -874,7 +849,8 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUALIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); @@ -934,18 +910,16 @@ compile_option_node(EffectNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; + } - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); } - else { - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; - } return r; } @@ -992,7 +966,7 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; @@ -1082,7 +1056,7 @@ compile_effect_node(EffectNode* node, regex_t* reg) break; case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { + if (IS_EFFECT_STOP_BT_SIMPLE_REPEAT(node)) { QualifierNode* qn = &NQUALIFIER(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; @@ -1267,7 +1241,7 @@ compile_length_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_length_string_raw_node(&(NSTRING(node)), reg); else - r = compile_length_string_node(&(NSTRING(node)), reg); + r = compile_length_string_node(node, reg); break; case N_CCLASS: @@ -1365,7 +1339,7 @@ compile_tree(Node* node, regex_t* reg) if (NSTRING_IS_RAW(node)) r = compile_string_raw_node(&(NSTRING(node)), reg); else - r = compile_string_node(&(NSTRING(node)), reg); + r = compile_string_node(node, reg); break; case N_CCLASS: @@ -1421,8 +1395,14 @@ compile_tree(Node* node, regex_t* reg) } else { int* p; - add_opcode(reg, (IS_IGNORECASE(reg->options) ? - OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); + + if (IS_IGNORECASE(reg->options)) { + add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + add_opcode(reg, OP_BACKREF_MULTI); + } + if (r) return r; add_length(reg, br->back_num); if (r) return r; @@ -2053,7 +2033,7 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) StrNode* sn = &(NSTRING(node)); UChar *s = sn->s; while (s < sn->end) { - s += enc_len(reg->enc, *s); + s += enc_len(reg->enc, s); (*len)++; } } @@ -2144,7 +2124,7 @@ onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) { int found; - if (code >= SINGLE_BYTE_SIZE) { + if (ONIGENC_MBC_MINLEN(enc) > 1 || (code >= SINGLE_BYTE_SIZE)) { if (IS_NULL(cc->mbuf)) { found = 0; } @@ -2309,7 +2289,7 @@ is_not_included(Node* x, Node* y, regex_t* reg) CClassNode* cc = &(NCCLASS(y)); code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + enc_len(reg->enc, c)); + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); } break; @@ -2320,18 +2300,9 @@ is_not_included(Node* x, Node* y, regex_t* reg) StrNode* ys = &(NSTRING(y)); len = NSTRING_LEN(x); if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { - UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int plen, qlen; - for (p = ys->s, q = xs->s; q < xs->end; ) { - plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); - qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); - if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) - return 1; - p += enc_len(reg->enc, *p); - q += enc_len(reg->enc, *q); - } + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; } else { for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { @@ -2388,8 +2359,12 @@ get_head_value_node(Node* node, int exact, regex_t* reg) if (exact != 0 && !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - if (! ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) +#if 0 + UChar* tmp = sn->s; + if (! ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, + &tmp, sn->end)) n = node; +#endif } else { n = node; @@ -2946,7 +2921,7 @@ next_setup(Node* node, Node* next_node, regex_t* reg) if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); swap_node(node, en); NEFFECT(node).target = en; } @@ -2965,9 +2940,114 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) +static int +divide_ambig_string_node(Node* node, regex_t* reg) +{ + StrNode* sn = &NSTRING(node); + int ambig, prev_ambig; + UChar *prev, *p, *end, *prev_start, *start, *tmp, *wp; + Node *snode; + Node *root = NULL_NODE; + Node **tailp = (Node** )0; + + start = prev_start = p = sn->s; + end = sn->end; + if (p >= end) return 0; + + prev_ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, reg->ambig_flag, &p, end); + + while (p < end) { + prev = p; + if (prev_ambig != (ambig = ONIGENC_IS_MBC_AMBIGUOUS(reg->enc, + reg->ambig_flag, &p, end))) { + + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < prev) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != prev) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, prev); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(root).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + prev_ambig = ambig; + prev_start = prev; + } + } + + if (prev_start == start) { + if (prev_ambig != 0) { + NSTRING_SET_AMBIG(node); + tmp = start; + wp = start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + if (wp != sn->end) NSTRING_SET_AMBIG_REDUCE(node); + sn->end = wp; + } + } + else { + if (prev_ambig != 0) { + tmp = prev_start; + wp = prev_start; + while (tmp < end) { + wp += ONIGENC_MBC_TO_NORMALIZE(reg->enc, reg->ambig_flag, + &tmp, end, wp); + } + snode = onig_node_new_str(prev_start, wp); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + NSTRING_SET_AMBIG(snode); + if (wp != end) NSTRING_SET_AMBIG_REDUCE(snode); + } + else { + snode = onig_node_new_str(prev_start, end); + CHECK_NULL_RETURN_VAL(snode, ONIGERR_MEMORY); + } + + if (tailp == (Node** )0) { + root = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(root, ONIGERR_MEMORY); + tailp = &(NCONS(node).right); + } + else { + *tailp = onig_node_new_list(snode, NULL); + CHECK_NULL_RETURN_VAL(*tailp, ONIGERR_MEMORY); + tailp = &(NCONS(*tailp).right); + } + + swap_node(node, root); + onig_node_str_clear(root); /* should be after swap! */ + onig_node_free(root); /* free original string node */ + } + + return 0; +} + +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) @@ -3005,33 +3085,11 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case N_CCLASS: - if (IS_IGNORECASE(reg->options)) { - int i; - UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - BitSetRef bs = NCCLASS(node).bs; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - c = (UChar )i; - ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); - if (*lowbuf != c) { - if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); - if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); - } - } - } break; case N_STRING: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - StrNode* sn = &NSTRING(node); - UChar* p = sn->s; - - while (p < sn->end) { - if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { - NSTRING_SET_CASE_AMBIG(node); - break; - } - p += enc_len(reg->enc, *p); - } + r = divide_ambig_string_node(node, reg); } break; @@ -3067,9 +3125,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) Node* target = qn->target; if ((state & IN_REPEAT) != 0) { - qn->state |= NST_IN_REPEAT; + qn->state |= NST_IN_REPEAT; } - + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { r = get_min_match_length(target, &d, env); if (r) break; @@ -3096,8 +3154,9 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) } } + state |= IN_REPEAT; if (qn->lower != qn->upper) - state |= IN_REPEAT; + state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; @@ -3154,11 +3213,13 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) break; case EFFECT_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ } - /* fall */ + r = setup_tree(en->target, reg, state, env); + break; + case EFFECT_STOP_BACKTRACK: { Node* target = en->target; @@ -3169,7 +3230,7 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) tqn->greedy != 0) { /* (?>a*), a*+ etc... */ int qtype = NTYPE(tqn->target); if (IS_NODE_TYPE_SIMPLE(qtype)) - SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); + SET_EFFECT_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); } } } @@ -3241,26 +3302,17 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) /* set skip map for Boyer-Moor search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, +set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, UChar skip[], int** int_skip) { int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - skip[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + skip[s[i]] = len - 1 - i; } else { if (IS_NULL(*int_skip)) { @@ -3269,16 +3321,8 @@ set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*int_skip)[*lowbuf] = len - 1 - i; - } - } - else { - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = len - 1 - i; - } + for (i = 0; i < len - 1; i++) + (*int_skip)[s[i]] = len - 1 - i; } return 0; } @@ -3291,11 +3335,12 @@ typedef struct { } MinMaxLen; typedef struct { - MinMaxLen mmd; - BitStatusType backrefed_status; - OnigEncoding enc; - OnigOptionType options; - ScanEnv* scan_env; + MinMaxLen mmd; + BitStatusType backrefed_status; + OnigEncoding enc; + OnigOptionType options; + OnigAmbigType ambig_flag; + ScanEnv* scan_env; } OptEnv; typedef struct { @@ -3332,31 +3377,31 @@ typedef struct { OptMapInfo map; /* boundary */ } NodeOptInfo; +static short int ByteValTable[] = { + 14, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 +}; static int map_position_value(int i) { - static int vals[] = { - 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, - }; - - if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; - - return 7; /* Take it easy. */ + if (i < sizeof(ByteValTable)/sizeof(ByteValTable[0])) + return (int )ByteValTable[i]; + else + return 4; /* Take it easy. */ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static int dist_vals[] = { + static short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, @@ -3376,7 +3421,7 @@ distance_value(MinMaxLen* mm) d = mm->max - mm->min; if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) /* return dist_vals[d] * 16 / (mm->min + 12); */ - return dist_vals[d]; + return (int )dist_vals[d]; else return 1; } @@ -3432,12 +3477,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from) to->max = distance_add(to->max, from->max); } +#if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } +#endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) @@ -3584,7 +3631,7 @@ concat_opt_exact_info_str(OptExactInfo* to, to->s[i++] = *p++; } else { - len = enc_len(enc, *p); + len = enc_len(enc, p); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len; j++) to->s[i++] = *p++; @@ -3611,7 +3658,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; - len = enc_len(env->enc, to->s[i]); + len = enc_len(env->enc, to->s + i); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; @@ -3633,12 +3680,24 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) static void select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) { - int vlen1, vlen2; + int v1, v2; + + v1 = now->len; + v2 = alt->len; - vlen1 = now->len * (now->ignore_case ? 1 : 2); - vlen2 = alt->len * (alt->ignore_case ? 1 : 2); + if (v1 <= 2 && v2 <= 2) { + /* ByteValTable[x] is big value --> low price */ + v2 = map_position_value(now->s[0]); + v1 = map_position_value(alt->s[0]); - if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + if (now->len > 1) v1 += 5; + if (alt->len > 1) v2 += 5; + } + + if (now->ignore_case == 0) v1 *= 2; + if (alt->ignore_case == 0) v2 *= 2; + + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } @@ -3661,7 +3720,7 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) } static void -add_char_opt_map_info(OptMapInfo* map, int c) +add_char_opt_map_info(OptMapInfo* map, UChar c) { if (map->map[c] == 0) { map->map[c] = 1; @@ -3669,26 +3728,48 @@ add_char_opt_map_info(OptMapInfo* map, int c) } } -static void -add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +static int +add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, + OnigEncoding enc, OnigAmbigType ambig_flag) { - UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + int i, j, n, len; + UChar buf[ONIGENC_MBC_NORMALIZE_MAXLEN]; + OnigCodePoint code, ccode; + OnigCompAmbigCodes* ccs; + OnigPairAmbigCodes* pccs; + OnigAmbigType amb; - add_char_opt_map_info(map, c); + add_char_opt_map_info(map, p[0]); + code = ONIGENC_MBC_TO_CODE(enc, p, end); - x = (UChar )c; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if (*low != x) { - add_char_opt_map_info(map, (int )(*low)); - } - else { - int i; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { - x = (UChar )i; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if ((int )(*low) == c) add_char_opt_map_info(map, i); + for (amb = 0x01; amb <= ONIGENC_AMBIGUOUS_MATCH_LIMIT; amb <<= 1) { + if ((amb & ambig_flag) == 0) continue; + + n = ONIGENC_GET_ALL_PAIR_AMBIG_CODES(enc, amb, &pccs); + for (i = 0; i < n; i++) { + if (pccs[i].from == code) { + len = ONIGENC_CODE_TO_MBC(enc, pccs[i].to, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + } + + if ((ambig_flag & ONIGENC_AMBIGUOUS_MATCH_COMPOUND) != 0) { + n = ONIGENC_GET_ALL_COMP_AMBIG_CODES(enc, amb, &ccs); + for (i = 0; i < n; i++) { + if (ccs[i].code == code) { + for (j = 0; j < ccs[i].n; j++) { + ccode = ccs[i].items[j].code[0]; + len = ONIGENC_CODE_TO_MBC(enc, ccode, buf); + if (len < 0) return len; + add_char_opt_map_info(map, buf[0]); + } + break; + } + } } } + return 0; } static void @@ -3894,143 +3975,110 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) case N_STRING: { - UChar *p; - int len, plen; StrNode* sn = &(NSTRING(node)); int slen = sn->end - sn->s; int is_raw = NSTRING_IS_RAW(node); - if ((! IS_IGNORECASE(env->options)) || is_raw) { + if (! NSTRING_IS_AMBIG(node)) { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, NSTRING_IS_RAW(node), env->enc); if (slen > 0) { add_char_opt_map_info(&opt->map, *(sn->s)); } + set_mml(&opt->len, slen, slen); } else { - for (p = sn->s; p < sn->end; ) { - len = enc_len(env->enc, *p); - if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { - break; - } - p += len; - } + int n, max; - plen = p - sn->s; - if (plen > slen / 5) { - concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); - concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); - opt->exm.ignore_case = 1; - if (opt->exm.len == sn->end - p) - opt->exm.reach_end = 1; - - copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); - add_len_mml(&(opt->exm.mmd), plen); - } - else { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - is_raw, env->enc); - opt->exb.ignore_case = 1; - } + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; if (slen > 0) { - if (p == sn->s) - add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); - else - add_char_opt_map_info(&opt->map, *(sn->s)); + r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, + env->enc, env->ambig_flag); + if (r != 0) break; } + + if (NSTRING_IS_AMBIG_REDUCE(node)) { + n = onigenc_strlen(env->enc, sn->s, sn->end); + max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; + } + else { + max = slen; + } + set_mml(&opt->len, slen, max); } if (opt->exb.len == slen) opt->exb.reach_end = 1; - - set_mml(&opt->len, slen, slen); } break; case N_CCLASS: { - int i, z, len, found, mb_found; + int i, z; CClassNode* cc = &(NCCLASS(node)); /* no need to check ignore case. (setted in setup_tree()) */ - found = mb_found = 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !cc->not) || (!z && cc->not)) { - found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - if (! ONIGENC_IS_SINGLEBYTE(env->enc)) { - if (! IS_NULL(cc->mbuf) || - (cc->not != 0 && found != 0)) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = ONIGENC_IS_MBC_HEAD(env->enc, i); - if (z) { - mb_found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - } - } + if (IS_NOT_NULL(cc->mbuf) || cc->not != 0) { + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - if (mb_found) { - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + set_mml(&opt->len, min, max); } - else if (found) { - len = 1; - set_mml(&opt->len, 1, len); + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !cc->not) || (!z && cc->not)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + set_mml(&opt->len, 1, 1); } } break; case N_CTYPE: { - int c; - int len, min, max; + int i, min, max; - min = ONIGENC_MBC_MAXLEN_DIST(env->enc); - max = 0; + max = ONIGENC_MBC_MAXLEN_DIST(env->enc); -#define IS_WORD_HEAD_BYTE(enc,b) \ - (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ - : ONIGENC_IS_MBC_HEAD(enc,b)) + if (max == 1) { + min = 1; - switch (NCTYPE(node).type) { - case CTYPE_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; + switch (NCTYPE(node).type) { + case CTYPE_NOT_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; - case CTYPE_NOT_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } + case CTYPE_WORD: + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i); + } + } + break; } - break; } - + else { + min = ONIGENC_MBC_MINLEN(env->enc); + } set_mml(&opt->len, min, max); } break; case N_ANYCHAR: { - OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, min, max); } break; @@ -4231,36 +4279,20 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->len == 0) return 0; - reg->exact = onig_strdup(e->s, e->s + e->len); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - - reg->exact_end = reg->exact + e->len; - if (e->ignore_case) { - UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int len, low_len, i, j, alloc_size; - - alloc_size = e->len; - i = j = 0; - while (i < e->len) { - low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); - len = enc_len(reg->enc, e->s[i]); - if (low_len > alloc_size - i) { - reg->exact = xrealloc(reg->exact, alloc_size * 2); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - alloc_size *= 2; - } - - xmemcpy(&(reg->exact[j]), buf, low_len); - i += len; - j += low_len; - } - reg->exact_end = reg->exact + j; + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + xmemcpy(reg->exact, e->s, e->len); + reg->exact_end = reg->exact + e->len; reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } else { int allow_reverse; + reg->exact = onig_strdup(e->s, e->s + e->len); + CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); + reg->exact_end = reg->exact + e->len; + if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) allow_reverse = 1; else @@ -4268,7 +4300,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, + r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, reg->map, &(reg->int_map)); if (r) return r; @@ -4328,6 +4360,7 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) env.enc = reg->enc; env.options = reg->options; + env.ambig_flag = reg->ambig_flag; env.scan_env = scan_env; clear_mml(&env.mmd); @@ -4482,17 +4515,26 @@ print_optimize_info(FILE* f, regex_t* reg) fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { - int i, n = 0; + int c, i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { + c = 0; fputc('[', f); - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - if (reg->map[i] && enc_len(reg->enc, i) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, i)) - fputc(i, f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (reg->map[i] != 0) { + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } + } fprintf(f, "]\n"); } } @@ -4531,7 +4573,7 @@ onig_free(regex_t* reg) xfree(from);\ } while (0) -static void +extern void onig_transfer(regex_t* to, regex_t* from) { THREAD_ATOMIC_START; @@ -4545,7 +4587,7 @@ onig_transfer(regex_t* to, regex_t* from) }\ } while (0) -static void +extern void onig_chain_link_add(regex_t* to, regex_t* add) { THREAD_ATOMIC_START; @@ -4598,7 +4640,8 @@ onig_clone(regex_t** to, regex_t* from) from->state++; /* increment as search counter */ } - r = onig_alloc_init(®, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT); + r = onig_alloc_init(®, ONIG_OPTION_NONE, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + from->enc, ONIG_SYNTAX_DEFAULT); if (r != 0) { from->state--; return r; @@ -4829,8 +4872,8 @@ onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, static int onig_inited = 0; extern int -onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax) +onig_alloc_init(regex_t** reg, OnigOptionType option, OnigAmbigType ambig_flag, + OnigEncoding enc, OnigSyntaxType* syntax) { if (! onig_inited) onig_init(); @@ -4863,6 +4906,9 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, (*reg)->used = 0; (*reg)->name_table = (void* )NULL; + (*reg)->ambig_flag = ambig_flag; + (*reg)->ambig_flag &= ONIGENC_SUPPORT_AMBIG_FLAG(enc); + return 0; } @@ -4875,7 +4921,8 @@ onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; - r = onig_alloc_init(reg, option, enc, syntax); + r = onig_alloc_init(reg, option, ONIGENC_AMBIGUOUS_MATCH_DEFAULT, + enc, syntax); if (r) return r; r = onig_compile(*reg, pattern, pattern_end, einfo); @@ -4971,7 +5018,7 @@ OnigOpInfoType OnigOpInfo[] = { { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREF3, "backref3", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, @@ -4992,6 +5039,8 @@ OnigOpInfoType OnigOpInfo[] = { { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, @@ -5058,7 +5107,8 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) } extern void -onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, + OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; @@ -5150,7 +5200,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case OP_EXACT1_IC: - p_string(f, 1, bp++); + len = enc_len(enc, bp); + p_string(f, len, bp); + bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); @@ -5196,8 +5248,14 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) fprintf(f, ":%d:%d:%d", n, (int )code, len); break; - case OP_BACKREF_MULTI: + case OP_BACKREFN_IC: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { @@ -5265,7 +5323,7 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) else fputs(" ", f); } - onig_print_compiled_byte_code(f, bp, &bp); + onig_print_compiled_byte_code(f, bp, &bp, reg->enc); } fprintf(f, "\n"); @@ -5325,12 +5383,6 @@ print_indent_tree(FILE* f, Node* node, int indent) fprintf(f, "%0x", bbuf->p[i]); } } -#if 0 - fprintf(f, "\n"); - Indent(f, indent); - for (i = 0; i < SINGLE_BYTE_SIZE; i++) - fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); -#endif break; case N_CTYPE: -- cgit v1.2.3