diff options
Diffstat (limited to 'regcomp.c')
| -rw-r--r-- | regcomp.c | 5413 |
1 files changed, 3362 insertions, 2051 deletions
@@ -1,21 +1,95 @@ /********************************************************************** + regcomp.c - Onigmo (Oniguruma-mod) (regular expression library) +**********************************************************************/ +/*- + * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ - regcomp.c - Oniguruma (regular expression library) +#include "regparse.h" - Copyright (C) 2002-2004 K.Kosako (kosako@sofnec.co.jp) +OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; + +extern OnigCaseFoldType +onig_get_default_case_fold_flag(void) +{ + return OnigDefaultCaseFoldFlag; +} + +extern int +onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) +{ + OnigDefaultCaseFoldFlag = case_fold_flag; + return 0; +} -**********************************************************************/ -#include "regparse.h" #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif +#if 0 +static UChar* +str_dup(UChar* s, UChar* end) +{ + ptrdiff_t len = end - s; + + if (len > 0) { + UChar* r = (UChar* )xmalloc(len + 1); + CHECK_NULL_RETURN(r); + xmemcpy(r, s, len); + r[len] = (UChar )0; + return r; + } + else return NULL; +} +#endif + static void swap_node(Node* a, Node* b) { Node c; c = *a; *a = *b; *b = c; + + if (NTYPE(a) == NT_STR) { + StrNode* sn = NSTR(a); + if (sn->capa == 0) { + size_t len = sn->end - sn->s; + sn->s = sn->buf; + sn->end = sn->s + len; + } + } + + if (NTYPE(b) == NT_STR) { + StrNode* sn = NSTR(b); + if (sn->capa == 0) { + size_t len = sn->end - sn->s; + sn->s = sn->buf; + sn->end = sn->s + len; + } + } } static OnigDistance @@ -64,13 +138,40 @@ bitset_on_num(BitSetRef bs) } #endif +// Attempt to right size allocated buffers for a regex post compile +static void +onig_reg_resize(regex_t *reg) +{ + do { + if (!reg->used) { + xfree(reg->p); + reg->alloc = 0; + reg->p = 0; + } + else if (reg->alloc > reg->used) { + unsigned char *new_ptr = xrealloc(reg->p, reg->used); + // Skip the right size optimization if memory allocation fails + if (new_ptr) { + reg->alloc = reg->used; + reg->p = new_ptr; + } + } + } while ((reg = reg->chain) != 0); +} + extern int -onig_bbuf_init(BBuf* buf, int size) +onig_bbuf_init(BBuf* buf, OnigDistance size) { - buf->p = (UChar* )xmalloc(size); - if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); + if (size <= 0) { + size = 0; + buf->p = NULL; + } + else { + buf->p = (UChar* )xmalloc(size); + if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); + } - buf->alloc = size; + buf->alloc = (unsigned int )size; buf->used = 0; return 0; } @@ -84,7 +185,7 @@ unset_addr_list_init(UnsetAddrList* uslist, int size) UnsetAddr* p; p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size); - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(p); uslist->num = 0; uslist->alloc = size; uslist->us = p; @@ -94,8 +195,7 @@ unset_addr_list_init(UnsetAddrList* uslist, int size) static void unset_addr_list_end(UnsetAddrList* uslist) { - if (IS_NOT_NULL(uslist->us)) - xfree(uslist->us); + xfree(uslist->us); } static int @@ -107,7 +207,7 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) if (uslist->num >= uslist->alloc) { size = uslist->alloc * 2; p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size); - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(p); uslist->alloc = size; uslist->us = p; } @@ -120,52 +220,30 @@ unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) #endif /* USE_SUBEXP_CALL */ -#if 0 static int -bitset_mbmaxlen(BitSetRef bs, int negative, OnigEncoding enc) +add_opcode(regex_t* reg, int opcode) { - int i; - int len, maxlen = 0; - - if (negative) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - else { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(bs, i)) { - len = enc_len(enc, i); - if (len > maxlen) maxlen = len; - } - } - } - return maxlen; + BBUF_ADD1(reg, opcode); + return 0; } -#endif +#ifdef USE_COMBINATION_EXPLOSION_CHECK static int -add_opcode(regex_t* reg, int opcode) +add_state_check_num(regex_t* reg, int num) { - BBUF_ADD1(reg, opcode); + StateCheckNumType n = (StateCheckNumType )num; + + BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM); return 0; } +#endif static int add_rel_addr(regex_t* reg, int addr) { RelAddrType ra = (RelAddrType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &ra, SIZE_RELADDR); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_RELADDR(ra, buf); - BBUF_ADD(reg, buf, SIZE_RELADDR); -#endif return 0; } @@ -174,28 +252,16 @@ add_abs_addr(regex_t* reg, int addr) { AbsAddrType ra = (AbsAddrType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &ra, SIZE_ABSADDR); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_ABSADDR(ra, buf); - BBUF_ADD(reg, buf, SIZE_ABSADDR); -#endif return 0; } static int -add_length(regex_t* reg, int len) +add_length(regex_t* reg, OnigDistance len) { LengthType l = (LengthType )len; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &l, SIZE_LENGTH); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_LENGTH(l, buf); - BBUF_ADD(reg, buf, SIZE_LENGTH); -#endif return 0; } @@ -204,29 +270,17 @@ add_mem_num(regex_t* reg, int num) { MemNumType n = (MemNumType )num; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &n, SIZE_MEMNUM); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_MEMNUM(n, buf); - BBUF_ADD(reg, buf, SIZE_MEMNUM); -#endif return 0; } #if 0 static int -add_repeat_num(regex_t* reg, int num) +add_pointer(regex_t* reg, void* addr) { - RepeatNumType n = (RepeatNumType )num; + PointerType ptr = (PointerType )addr; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS - BBUF_ADD(reg, &n, SIZE_REPEATNUM); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_REPEATNUM(n, buf); - BBUF_ADD(reg, buf, SIZE_REPEATNUM); -#endif + BBUF_ADD(reg, &ptr, SIZE_POINTER); return 0; } #endif @@ -234,13 +288,7 @@ add_repeat_num(regex_t* reg, int num) static int add_option(regex_t* reg, OnigOptionType option) { -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_ADD(reg, &option, SIZE_OPTION); -#else - UChar buf[SERIALIZE_BUFSIZE]; - SERIALIZE_OPTION(option, buf); - BBUF_ADD(reg, buf, SIZE_OPTION); -#endif return 0; } @@ -256,7 +304,7 @@ add_opcode_rel_addr(regex_t* reg, int opcode, int addr) } static int -add_bytes(regex_t* reg, UChar* bytes, int len) +add_bytes(regex_t* reg, UChar* bytes, OnigDistance len) { BBUF_ADD(reg, bytes, len); return 0; @@ -289,19 +337,20 @@ static int compile_tree(Node* node, regex_t* reg); (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) static int -select_str_opcode(int mb_len, int str_len, int ignore_case) +select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case) { int op; + OnigDistance str_len = roomof(byte_len, mb_len); - switch (mb_len) { - case 1: - if (ignore_case) { - switch (str_len) { - case 1: op = OP_EXACT1_IC; break; - default: op = OP_EXACTN_IC; break; - } + if (ignore_case) { + switch (str_len) { + case 1: op = OP_EXACT1_IC; break; + default: op = OP_EXACTN_IC; break; } - else { + } + else { + switch (mb_len) { + case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; @@ -310,25 +359,25 @@ select_str_opcode(int mb_len, int str_len, int ignore_case) case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } - } - break; + break; - case 2: - switch (str_len) { - case 1: op = OP_EXACTMB2N1; break; - case 2: op = OP_EXACTMB2N2; break; - case 3: op = OP_EXACTMB2N3; break; - default: op = OP_EXACTMB2N; break; - } - break; + case 2: + switch (str_len) { + case 1: op = OP_EXACTMB2N1; break; + case 2: op = OP_EXACTMB2N2; break; + case 3: op = OP_EXACTMB2N3; break; + default: op = OP_EXACTMB2N; break; + } + break; - case 3: - op = OP_EXACTMB3N; - break; + case 3: + op = OP_EXACTMB3N; + break; - default: - op = OP_EXACTMBN; - break; + default: + op = OP_EXACTMBN; + break; + } } return op; } @@ -345,6 +394,7 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info) r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */ if (r) return r; reg->num_null_check++; + if ((MemNumType)reg->num_null_check <= 0) return ONIGERR_TOO_MANY_NULL_CHECK; } r = compile_tree(node, reg); @@ -373,7 +423,7 @@ compile_call(CallNode* node, regex_t* reg) r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), - node->target); + node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; @@ -393,88 +443,79 @@ compile_tree_n_times(Node* node, int n, regex_t* reg) } static int -add_compile_string_length(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len, + regex_t* reg ARG_UNUSED, int ignore_case) { int len; - int op = select_str_opcode(mb_len, str_len, ignore_case); + int op = select_str_opcode(mb_len, byte_len, ignore_case); len = SIZE_OPCODE; - if (op == OP_EXACTMBN) - len += SIZE_LENGTH; + if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; - len += mb_len * str_len; + len += (int )byte_len; return len; } static int -add_compile_string(UChar* s, int mb_len, int str_len, - regex_t* reg, int ignore_case) +add_compile_string(UChar* s, int mb_len, OnigDistance byte_len, + regex_t* reg, int ignore_case) { - int op = select_str_opcode(mb_len, str_len, ignore_case); + int op = select_str_opcode(mb_len, byte_len, ignore_case); add_opcode(reg, op); if (op == OP_EXACTMBN) add_length(reg, mb_len); - if (IS_NEED_STR_LEN_OP_EXACT(op)) - add_length(reg, str_len); + if (IS_NEED_STR_LEN_OP_EXACT(op)) { + if (op == OP_EXACTN_IC) + add_length(reg, byte_len); + else + add_length(reg, byte_len / mb_len); + } - add_bytes(reg, s, mb_len * str_len); + add_bytes(reg, s, byte_len); return 0; } static int -compile_length_string_node(StrNode* sn, regex_t* reg) +compile_length_string_node(Node* node, regex_t* reg) { - int rlen, r, len, prev_len, slen, ambig, ic; + int rlen, r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; + StrNode* sn; + sn = NSTR(node); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - + prev_len = enclen(enc, p, sn->end); p += prev_len; - slen = 1; + blen = prev_len; rlen = 0; for (; p < sn->end; ) { - len = enc_len(enc, *p); - if (len == prev_len) { - slen++; - if (ic != 0 && ambig == 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); + len = enclen(enc, p, sn->end); + if (len == prev_len || ambig) { + blen += len; } else { - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; - - if (ic != 0 && len == 1) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - else - ambig = 0; - prev = p; - slen = 1; + blen = len; prev_len = len; } - p += len; } - r = add_compile_string_length(prev, prev_len, slen, reg, ambig); + r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; return rlen; } @@ -489,58 +530,42 @@ compile_length_string_raw_node(StrNode* sn, regex_t* reg) } static int -compile_string_node(StrNode* sn, regex_t* reg) +compile_string_node(Node* node, regex_t* reg) { - int r, len, prev_len, slen, ambig, ic; + int r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; - UChar *p, *prev; + UChar *p, *prev, *end; + StrNode* sn; + sn = NSTR(node); if (sn->end <= sn->s) return 0; - ic = IS_IGNORECASE(reg->options); + end = sn->end; + ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; - prev_len = enc_len(enc, *p); - if (ic != 0 && prev_len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) - ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; - + prev_len = enclen(enc, p, end); p += prev_len; - slen = 1; + blen = prev_len; - for (; p < sn->end; ) { - len = enc_len(enc, *p); - if (len == prev_len) { - slen++; - if (ic != 0 && len == 1) { - if (ambig == 0) - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } + for (; p < end; ) { + len = enclen(enc, p, end); + if (len == prev_len || ambig) { + blen += len; } else { - r = add_compile_string(prev, prev_len, slen, reg, ambig); + r = add_compile_string(prev, prev_len, blen, reg, ambig); if (r) return r; - if (ic != 0 && len == 1) { - ambig = ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p); - if (ambig != 0) ONIGENC_MBC_TO_LOWER(reg->enc, p, p); - } - else - ambig = 0; prev = p; - slen = 1; + blen = len; prev_len = len; } p += len; } - return add_compile_string(prev, prev_len, slen, reg, ambig); + return add_compile_string(prev, prev_len, blen, reg, ambig); } static int @@ -584,8 +609,7 @@ compile_length_cclass_node(CClassNode* cc, regex_t* reg) len = SIZE_OPCODE + SIZE_BITSET; } else { - if (bitset_is_empty(cc->bs)) { - /* SIZE_BITSET is included in mbuf->used. */ + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { @@ -607,21 +631,27 @@ compile_cclass_node(CClassNode* cc, regex_t* reg) int r; if (IS_NULL(cc->mbuf)) { - if (cc->not) add_opcode(reg, OP_CCLASS_NOT); - else add_opcode(reg, OP_CCLASS); + if (IS_NCCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_NOT); + else + add_opcode(reg, OP_CCLASS); r = add_bitset(reg, cc->bs); } else { - if (bitset_is_empty(cc->bs)) { - if (cc->not) add_opcode(reg, OP_CCLASS_MB_NOT); - else add_opcode(reg, OP_CCLASS_MB); + if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { + if (IS_NCCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_MB_NOT); + else + add_opcode(reg, OP_CCLASS_MB); r = add_multi_byte_cclass(cc->mbuf, reg); } else { - if (cc->not) add_opcode(reg, OP_CCLASS_MIX_NOT); - else add_opcode(reg, OP_CCLASS_MIX); + if (IS_NCCLASS_NOT(cc)) + add_opcode(reg, OP_CCLASS_MIX_NOT); + else + add_opcode(reg, OP_CCLASS_MIX); r = add_bitset(reg, cc->bs); if (r) return r; @@ -641,7 +671,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) if (reg->repeat_range_alloc == 0) { p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; } @@ -649,8 +679,8 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, - sizeof(OnigRepeatRange) * n); - CHECK_NULL_RETURN_VAL(p, ONIGERR_MEMORY); + sizeof(OnigRepeatRange) * n); + CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; } @@ -659,13 +689,13 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = upper; + p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); return 0; } static int -compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, - regex_t* reg) +compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info, + regex_t* reg) { int r; int num_repeat = reg->num_repeat; @@ -674,6 +704,7 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ reg->num_repeat++; + if ((MemNumType)reg->num_repeat <= 0) return ONIGERR_TOO_MANY_RANGE_REPEAT; if (r) return r; r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC); if (r) return r; @@ -684,16 +715,272 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; - r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); + if ( +#ifdef USE_SUBEXP_CALL + reg->num_call > 0 || +#endif + IS_QUANTIFIER_IN_REPEAT(qn)) { + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); + } + else { + r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); + } if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; } -#define QUALIFIER_EXPAND_LIMIT_SIZE 50 +static int +is_anychar_star_quantifier(QtfrNode* qn) +{ + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + NTYPE(qn->target) == NT_CANY) + return 1; + else + return 0; +} + +#define QUANTIFIER_EXPAND_LIMIT_SIZE 50 +#define CKN_ON (ckn > 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +static int +compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) +{ + int len, mod_tlen, cklen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); + + cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0); + + /* anychar repeat */ + if (NTYPE(qn->target) == NT_CANY) { + if (qn->greedy && infinite) { + if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; + else + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) + len = SIZE_OP_JUMP; + else + len = 0; + + len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP; + } + else { + if (qn->lower == 0) + len = SIZE_OP_JUMP; + else + len = 0; + + len += mod_tlen + SIZE_OP_PUSH + cklen; + } + } + else if (qn->upper == 0) { + if (qn->is_referred != 0) /* /(?<n>..){0}/ */ + len = SIZE_OP_JUMP + tlen; + else + len = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + len = SIZE_OP_STATE_CHECK_PUSH + tlen; + } + else { + len = SIZE_OP_PUSH + tlen; + } + } + else { + len = tlen; + } + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen; + } + else { + len = SIZE_OP_REPEAT_INC + + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; + if (CKN_ON) + len += SIZE_OP_STATE_CHECK; + } + + return len; +} + +static int +compile_quantifier_node(QtfrNode* qn, regex_t* reg) +{ + int r, mod_tlen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); + + if (is_anychar_star_quantifier(qn)) { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) { + if (IS_MULTILINE(reg->options)) + r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); + else + r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); + if (r) return r; + if (CKN_ON) { + r = add_state_check_num(reg, ckn); + if (r) return r; + } + + return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); + } + else { + if (IS_MULTILINE(reg->options)) { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_ML_STAR + : OP_ANYCHAR_ML_STAR)); + } + else { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_STAR + : OP_ANYCHAR_STAR)); + } + if (r) return r; + if (CKN_ON) + r = add_state_check_num(reg, ckn); + + return r; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) { + r = add_opcode_rel_addr(reg, OP_JUMP, + (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)); + if (r) return r; + } + + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); + } + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + (int )SIZE_OP_JUMP + + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH))); + } + else { + if (qn->lower == 0) { + r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); + if (r) return r; + } + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, + -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP)); + } + else + r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); + } + } + else if (qn->upper == 0) { + if (qn->is_referred != 0) { /* /(?<n>..){0}/ */ + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else + r = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, tlen); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, tlen); + } + if (r) return r; + } + + r = compile_tree(qn->target, reg); + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); + } + + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else { + r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); + if (CKN_ON) { + if (r) return r; + r = add_opcode(reg, OP_STATE_CHECK); + if (r) return r; + r = add_state_check_num(reg, ckn); + } + } + return r; +} + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ static int -compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) +compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) { int len, mod_tlen; int infinite = IS_REPEAT_INFINITE(qn->upper); @@ -703,12 +990,12 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) if (tlen < 0) return tlen; /* anychar repeat */ - if (NTYPE(qn->target) == N_ANYCHAR) { + if (NTYPE(qn->target) == NT_CANY) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) - return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else - return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } @@ -718,8 +1005,8 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) mod_tlen = tlen; if (infinite && - (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) { - if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { + if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { len = SIZE_OP_JUMP; } else { @@ -727,21 +1014,25 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) } if (qn->greedy) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) - len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; - else if (IS_NOT_NULL(qn->next_head_exact)) - len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; + len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) + len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; else - len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; + len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; } else len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; } - else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */ + else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */ len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } @@ -757,17 +1048,7 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) } static int -is_anychar_star_qualifier(QualifierNode* qn) -{ - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && - NTYPE(qn->target) == N_ANYCHAR) - return 1; - else - return 0; -} - -static int -compile_qualifier_node(QualifierNode* qn, regex_t* reg) +compile_quantifier_node(QtfrNode* qn, regex_t* reg) { int i, r, mod_tlen; int infinite = IS_REPEAT_INFINITE(qn->upper); @@ -776,22 +1057,22 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) if (tlen < 0) return tlen; - if (is_anychar_star_qualifier(qn)) { + if (is_anychar_star_quantifier(qn)) { r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; if (IS_NOT_NULL(qn->next_head_exact)) { if (IS_MULTILINE(reg->options)) - r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); + r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); else - r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); + r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); if (r) return r; - return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); } else { if (IS_MULTILINE(reg->options)) - return add_opcode(reg, OP_ANYCHAR_ML_STAR); + return add_opcode(reg, OP_ANYCHAR_ML_STAR); else - return add_opcode(reg, OP_ANYCHAR_STAR); + return add_opcode(reg, OP_ANYCHAR_STAR); } } @@ -801,18 +1082,21 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) mod_tlen = tlen; if (infinite && - (qn->lower <= 1 || tlen * qn->lower <= QUALIFIER_EXPAND_LIMIT_SIZE)) { - if (qn->lower == 1 && tlen > QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { + if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { if (qn->greedy) { - if (IS_NOT_NULL(qn->head_exact)) - r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); - else if (IS_NOT_NULL(qn->next_head_exact)) - r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); - else - r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); +#ifdef USE_OP_PUSH_OR_JUMP_EXACT + if (IS_NOT_NULL(qn->head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); + else + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); } else { - r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP); + r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP); } if (r) return r; } @@ -822,33 +1106,36 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) } if (qn->greedy) { +#ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) { - r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1, - mod_tlen + SIZE_OP_JUMP); - if (r) return r; - add_bytes(reg, NSTRING(qn->head_exact).s, 1); - r = compile_tree_empty_check(qn->target, reg, empty_info); - if (r) return r; - r = add_opcode_rel_addr(reg, OP_JUMP, - -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1)); + r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1, + mod_tlen + SIZE_OP_JUMP); + if (r) return r; + add_bytes(reg, NSTR(qn->head_exact)->s, 1); + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1)); } - else if (IS_NOT_NULL(qn->next_head_exact)) { - r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT, - mod_tlen + SIZE_OP_JUMP); - if (r) return r; - add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); - r = compile_tree_empty_check(qn->target, reg, empty_info); - if (r) return r; - r = add_opcode_rel_addr(reg, OP_JUMP, + else +#endif + if (IS_NOT_NULL(qn->next_head_exact)) { + r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT, + mod_tlen + SIZE_OP_JUMP); + if (r) return r; + add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT)); } else { - r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); - if (r) return r; - r = compile_tree_empty_check(qn->target, reg, empty_info); - if (r) return r; - r = add_opcode_rel_addr(reg, OP_JUMP, - -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH)); + r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH)); } } else { @@ -859,13 +1146,14 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); } } - else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?<n>..){0}/ */ + else if (qn->upper == 0 && qn->is_referred != 0) { /* /(?<n>..){0}/ */ r = add_opcode_rel_addr(reg, OP_JUMP, tlen); if (r) return r; r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && - (tlen + SIZE_OP_PUSH) * qn->upper <= QUALIFIER_EXPAND_LIMIT_SIZE) { + (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper + <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); @@ -873,7 +1161,7 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) for (i = 0; i < n; i++) { r = add_opcode_rel_addr(reg, OP_PUSH, - (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH); + (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH); if (r) return r; r = compile_tree(qn->target, reg); if (r) return r; @@ -891,9 +1179,10 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) } return r; } +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ static int -compile_length_option_node(EffectNode* node, regex_t* reg) +compile_length_option_node(EncloseNode* node, regex_t* reg) { int tlen; OnigOptionType prev = reg->options; @@ -913,7 +1202,7 @@ compile_length_option_node(EffectNode* node, regex_t* reg) } static int -compile_option_node(EffectNode* node, regex_t* reg) +compile_option_node(EncloseNode* node, regex_t* reg) { int r; OnigOptionType prev = reg->options; @@ -925,28 +1214,26 @@ compile_option_node(EffectNode* node, regex_t* reg) if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; + } - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; + reg->options = node->option; + r = compile_tree(node->target, reg); + reg->options = prev; + + if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); } - else { - reg->options = node->option; - r = compile_tree(node->target, reg); - reg->options = prev; - } return r; } static int -compile_length_effect_node(EffectNode* node, regex_t* reg) +compile_length_enclose_node(EncloseNode* node, regex_t* reg) { int len; int tlen; - if (node->type == EFFECT_OPTION) + if (node->type == ENCLOSE_OPTION) return compile_length_option_node(node, reg); if (node->target) { @@ -957,43 +1244,79 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) tlen = 0; switch (node->type) { - case EFFECT_MEMORY: + case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL - if (IS_EFFECT_CALLED(node)) { + if (IS_ENCLOSE_CALLED(node)) { len = SIZE_OP_MEMORY_START_PUSH + tlen - + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; + + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) - len += (IS_EFFECT_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + len += (IS_ENCLOSE_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); else - len += (IS_EFFECT_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (IS_ENCLOSE_RECURSION(node) + ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + } + else if (IS_ENCLOSE_RECURSION(node)) { + len = SIZE_OP_MEMORY_START_PUSH; + len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); } else #endif { if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum)) - len = SIZE_OP_MEMORY_START_PUSH; + len = SIZE_OP_MEMORY_START_PUSH; else - len = SIZE_OP_MEMORY_START; + len = SIZE_OP_MEMORY_START; len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) - ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); + ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); } break; - case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { - QualifierNode* qn = &NQUALIFIER(node->target); + case ENCLOSE_STOP_BACKTRACK: + /* Disable POP_STOP_BT optimization for simple repeat under the match cache */ + /* optimization because the match cache optimization pushes an extra item to */ + /* the stack and it breaks the assumption for this optimization. */ +#ifndef USE_MATCH_CACHE + if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) { + QtfrNode* qn = NQTFR(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; len = tlen * qn->lower - + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP; + + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP; } else { +#endif len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT; +#ifndef USE_MATCH_CACHE } +#endif + break; + + case ENCLOSE_CONDITION: + len = SIZE_OP_CONDITION; + if (NTYPE(node->target) == NT_ALT) { + Node* x = node->target; + + tlen = compile_length_tree(NCAR(x), reg); /* yes-node */ + if (tlen < 0) return tlen; + len += tlen + SIZE_OP_JUMP; + if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; + x = NCDR(x); + tlen = compile_length_tree(NCAR(x), reg); /* no-node */ + if (tlen < 0) return tlen; + len += tlen; + if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; + } + else { + return ONIGERR_PARSER_BUG; + } + break; + + case ENCLOSE_ABSENT: + len = SIZE_OP_PUSH_ABSENT_POS + SIZE_OP_ABSENT + tlen + SIZE_OP_ABSENT_END; break; default: @@ -1007,17 +1330,17 @@ compile_length_effect_node(EffectNode* node, regex_t* reg) static int get_char_length_tree(Node* node, regex_t* reg, int* len); static int -compile_effect_node(EffectNode* node, regex_t* reg) +compile_enclose_node(EncloseNode* node, regex_t* reg) { int r, len; - if (node->type == EFFECT_OPTION) + if (node->type == ENCLOSE_OPTION) return compile_option_node(node, reg); switch (node->type) { - case EFFECT_MEMORY: + case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL - if (IS_EFFECT_CALLED(node)) { + if (IS_ENCLOSE_CALLED(node)) { r = add_opcode(reg, OP_CALL); if (r) return r; node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; @@ -1027,11 +1350,11 @@ compile_effect_node(EffectNode* node, regex_t* reg) len = compile_length_tree(node->target, reg); len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) - len += (IS_EFFECT_RECURSION(node) - ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); + len += (IS_ENCLOSE_RECURSION(node) + ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); else - len += (IS_EFFECT_RECURSION(node) - ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); + len += (IS_ENCLOSE_RECURSION(node) + ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); r = add_opcode_rel_addr(reg, OP_JUMP, len); if (r) return r; @@ -1047,34 +1370,46 @@ compile_effect_node(EffectNode* node, regex_t* reg) r = compile_tree(node->target, reg); if (r) return r; #ifdef USE_SUBEXP_CALL - if (IS_EFFECT_CALLED(node)) { + if (IS_ENCLOSE_CALLED(node)) { if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) - r = add_opcode(reg, (IS_EFFECT_RECURSION(node) - ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); + r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node) + ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); else - r = add_opcode(reg, (IS_EFFECT_RECURSION(node) - ? OP_MEMORY_END_REC : OP_MEMORY_END)); + r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node) + ? OP_MEMORY_END_REC : OP_MEMORY_END)); if (r) return r; r = add_mem_num(reg, node->regnum); if (r) return r; r = add_opcode(reg, OP_RETURN); } + else if (IS_ENCLOSE_RECURSION(node)) { + if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) + r = add_opcode(reg, OP_MEMORY_END_PUSH_REC); + else + r = add_opcode(reg, OP_MEMORY_END_REC); + if (r) return r; + r = add_mem_num(reg, node->regnum); + } else #endif { if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) - r = add_opcode(reg, OP_MEMORY_END_PUSH); + r = add_opcode(reg, OP_MEMORY_END_PUSH); else - r = add_opcode(reg, OP_MEMORY_END); + r = add_opcode(reg, OP_MEMORY_END); if (r) return r; r = add_mem_num(reg, node->regnum); } break; - case EFFECT_STOP_BACKTRACK: - if (IS_EFFECT_SIMPLE_REPEAT(node)) { - QualifierNode* qn = &NQUALIFIER(node->target); + case ENCLOSE_STOP_BACKTRACK: + /* Disable POP_STOP_BT optimization for simple repeat under the match cache */ + /* optimization because the match cache optimization pushes an extra item to */ + /* the stack and it breaks the assumption for this optimization. */ +#ifndef USE_MATCH_CACHE + if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) { + QtfrNode* qn = NQTFR(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; @@ -1088,15 +1423,64 @@ compile_effect_node(EffectNode* node, regex_t* reg) r = add_opcode(reg, OP_POP); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, - -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP)); + -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP)); } else { +#endif r = add_opcode(reg, OP_PUSH_STOP_BT); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; r = add_opcode(reg, OP_POP_STOP_BT); +#ifndef USE_MATCH_CACHE } +#endif + break; + + case ENCLOSE_CONDITION: + r = add_opcode(reg, OP_CONDITION); + if (r) return r; + r = add_mem_num(reg, node->regnum); + if (r) return r; + + if (NTYPE(node->target) == NT_ALT) { + Node* x = node->target; + int len2; + + len = compile_length_tree(NCAR(x), reg); /* yes-node */ + if (len < 0) return len; + if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; + x = NCDR(x); + len2 = compile_length_tree(NCAR(x), reg); /* no-node */ + if (len2 < 0) return len2; + if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; + + x = node->target; + r = add_rel_addr(reg, len + SIZE_OP_JUMP); + if (r) return r; + r = compile_tree(NCAR(x), reg); /* yes-node */ + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, len2); + if (r) return r; + x = NCDR(x); + r = compile_tree(NCAR(x), reg); /* no-node */ + } + else { + return ONIGERR_PARSER_BUG; + } + break; + + case ENCLOSE_ABSENT: + len = compile_length_tree(node->target, reg); + if (len < 0) return len; + + r = add_opcode(reg, OP_PUSH_ABSENT_POS); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_ABSENT, len + SIZE_OP_ABSENT_END); + if (r) return r; + r = compile_tree(node->target, reg); + if (r) return r; + r = add_opcode(reg, OP_ABSENT_END); break; default: @@ -1153,12 +1537,25 @@ compile_anchor_node(AnchorNode* node, regex_t* reg) case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; - case ANCHOR_WORD_BOUND: r = add_opcode(reg, OP_WORD_BOUND); break; - case ANCHOR_NOT_WORD_BOUND: r = add_opcode(reg, OP_NOT_WORD_BOUND); break; + case ANCHOR_WORD_BOUND: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND); + else r = add_opcode(reg, OP_WORD_BOUND); + break; + case ANCHOR_NOT_WORD_BOUND: + if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND); + else r = add_opcode(reg, OP_NOT_WORD_BOUND); + break; #ifdef USE_WORD_BEGIN_END - case ANCHOR_WORD_BEGIN: r = add_opcode(reg, OP_WORD_BEGIN); break; - case ANCHOR_WORD_END: r = add_opcode(reg, OP_WORD_END); break; + case ANCHOR_WORD_BEGIN: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN); + else r = add_opcode(reg, OP_WORD_BEGIN); + break; + case ANCHOR_WORD_END: + if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END); + else r = add_opcode(reg, OP_WORD_END); + break; #endif + case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break; case ANCHOR_PREC_READ: r = add_opcode(reg, OP_PUSH_POS); @@ -1184,11 +1581,11 @@ compile_anchor_node(AnchorNode* node, regex_t* reg) r = add_opcode(reg, OP_LOOK_BEHIND); if (r) return r; if (node->char_len < 0) { - r = get_char_length_tree(node->target, reg, &n); - if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + r = get_char_length_tree(node->target, reg, &n); + if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else - n = node->char_len; + n = node->char_len; r = add_length(reg, n); if (r) return r; r = compile_tree(node->target, reg); @@ -1200,14 +1597,14 @@ compile_anchor_node(AnchorNode* node, regex_t* reg) int n; len = compile_length_tree(node->target, reg); r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT, - len + SIZE_OP_FAIL_LOOK_BEHIND_NOT); + len + SIZE_OP_FAIL_LOOK_BEHIND_NOT); if (r) return r; if (node->char_len < 0) { - r = get_char_length_tree(node->target, reg, &n); - if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + r = get_char_length_tree(node->target, reg, &n); + if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else - n = node->char_len; + n = node->char_len; r = add_length(reg, n); if (r) return r; r = compile_tree(node->target, reg); @@ -1231,75 +1628,84 @@ compile_length_tree(Node* node, regex_t* reg) type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: len = 0; do { - r = compile_length_tree(NCONS(node).left, reg); + r = compile_length_tree(NCAR(node), reg); if (r < 0) return r; len += r; - } while (IS_NOT_NULL(node = NCONS(node).right)); + } while (IS_NOT_NULL(node = NCDR(node))); r = len; break; - case N_ALT: + case NT_ALT: { - int n; - - n = r = 0; + int n = 0; + len = 0; do { - r += compile_length_tree(NCONS(node).left, reg); - n++; - } while (IS_NOT_NULL(node = NCONS(node).right)); + r = compile_length_tree(NCAR(node), reg); + if (r < 0) return r; + len += r; + n++; + } while (IS_NOT_NULL(node = NCDR(node))); + r = len; r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); } break; - case N_STRING: + case NT_STR: if (NSTRING_IS_RAW(node)) - r = compile_length_string_raw_node(&(NSTRING(node)), reg); + r = compile_length_string_raw_node(NSTR(node), reg); else - r = compile_length_string_node(&(NSTRING(node)), reg); + r = compile_length_string_node(node, reg); break; - case N_CCLASS: - r = compile_length_cclass_node(&(NCCLASS(node)), reg); + case NT_CCLASS: + r = compile_length_cclass_node(NCCLASS(node), reg); break; - case N_CTYPE: - case N_ANYCHAR: + case NT_CTYPE: + case NT_CANY: r = SIZE_OPCODE; break; - case N_BACKREF: + case NT_BREF: { - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); +#ifdef USE_BACKREF_WITH_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH + + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); + } + else +#endif if (br->back_num == 1) { - r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 3) - ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); + r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2) + ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); } else { - r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); + r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); } } break; #ifdef USE_SUBEXP_CALL - case N_CALL: + case NT_CALL: r = SIZE_OP_CALL; break; #endif - case N_QUALIFIER: - r = compile_length_qualifier_node(&(NQUALIFIER(node)), reg); + case NT_QTFR: + r = compile_length_quantifier_node(NQTFR(node), reg); break; - case N_EFFECT: - r = compile_length_effect_node(&NEFFECT(node), reg); + case NT_ENCLOSE: + r = compile_length_enclose_node(NENCLOSE(node), reg); break; - case N_ANCHOR: - r = compile_length_anchor_node(&(NANCHOR(node)), reg); + case NT_ANCHOR: + r = compile_length_anchor_node(NANCHOR(node), reg); break; default: @@ -1317,131 +1723,160 @@ compile_tree(Node* node, regex_t* reg) type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: do { - r = compile_tree(NCONS(node).left, reg); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = compile_tree(NCAR(node), reg); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_ALT: + case NT_ALT: { Node* x = node; len = 0; do { - len += compile_length_tree(NCONS(x).left, reg); - if (NCONS(x).right != NULL) { - len += SIZE_OP_PUSH + SIZE_OP_JUMP; - } - } while (IS_NOT_NULL(x = NCONS(x).right)); + len += compile_length_tree(NCAR(x), reg); + if (NCDR(x) != NULL) { + len += SIZE_OP_PUSH + SIZE_OP_JUMP; + } + } while (IS_NOT_NULL(x = NCDR(x))); pos = reg->used + len; /* goal position */ do { - len = compile_length_tree(NCONS(node).left, reg); - if (IS_NOT_NULL(NCONS(node).right)) { - r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP); - if (r) break; - } - r = compile_tree(NCONS(node).left, reg); - if (r) break; - if (IS_NOT_NULL(NCONS(node).right)) { - len = pos - (reg->used + SIZE_OP_JUMP); - r = add_opcode_rel_addr(reg, OP_JUMP, len); - if (r) break; - } - } while (IS_NOT_NULL(node = NCONS(node).right)); + len = compile_length_tree(NCAR(node), reg); + if (IS_NOT_NULL(NCDR(node))) { + r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP); + if (r) break; + } + r = compile_tree(NCAR(node), reg); + if (r) break; + if (IS_NOT_NULL(NCDR(node))) { + len = pos - (reg->used + SIZE_OP_JUMP); + r = add_opcode_rel_addr(reg, OP_JUMP, len); + if (r) break; + } + } while (IS_NOT_NULL(node = NCDR(node))); } break; - case N_STRING: + case NT_STR: if (NSTRING_IS_RAW(node)) - r = compile_string_raw_node(&(NSTRING(node)), reg); + r = compile_string_raw_node(NSTR(node), reg); else - r = compile_string_node(&(NSTRING(node)), reg); + r = compile_string_node(node, reg); break; - case N_CCLASS: - r = compile_cclass_node(&(NCCLASS(node)), reg); + case NT_CCLASS: + r = compile_cclass_node(NCCLASS(node), reg); break; - case N_CTYPE: + case NT_CTYPE: { int op; - switch (NCTYPE(node).type) { - case CTYPE_WORD: op = OP_WORD; break; - case CTYPE_NOT_WORD: op = OP_NOT_WORD; break; + switch (NCTYPE(node)->ctype) { + case ONIGENC_CTYPE_WORD: + if (NCTYPE(node)->ascii_range != 0) { + if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD; + else op = OP_ASCII_WORD; + } + else { + if (NCTYPE(node)->not != 0) op = OP_NOT_WORD; + else op = OP_WORD; + } + break; default: - return ONIGERR_TYPE_BUG; - break; + return ONIGERR_TYPE_BUG; + break; } r = add_opcode(reg, op); } break; - case N_ANYCHAR: + case NT_CANY: if (IS_MULTILINE(reg->options)) r = add_opcode(reg, OP_ANYCHAR_ML); else r = add_opcode(reg, OP_ANYCHAR); break; - case N_BACKREF: + case NT_BREF: { - int i; - BackrefNode* br = &(NBACKREF(node)); - + BRefNode* br = NBREF(node); + +#ifdef USE_BACKREF_WITH_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + r = add_opcode(reg, OP_BACKREF_WITH_LEVEL); + if (r) return r; + r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE)); + if (r) return r; + r = add_length(reg, br->nest_level); + if (r) return r; + + goto add_bacref_mems; + } + else +#endif if (br->back_num == 1) { - n = br->back_static[0]; - if (IS_IGNORECASE(reg->options)) { - r = add_opcode(reg, OP_BACKREFN_IC); - if (r) return r; - r = add_mem_num(reg, n); - } - else { - switch (n) { - case 1: r = add_opcode(reg, OP_BACKREF1); break; - case 2: r = add_opcode(reg, OP_BACKREF2); break; - case 3: r = add_opcode(reg, OP_BACKREF3); break; - default: - r = add_opcode(reg, OP_BACKREFN); - if (r) return r; - r = add_mem_num(reg, n); - break; - } - } + n = br->back_static[0]; + if (IS_IGNORECASE(reg->options)) { + r = add_opcode(reg, OP_BACKREFN_IC); + if (r) return r; + r = add_mem_num(reg, n); + } + else { + switch (n) { + case 1: r = add_opcode(reg, OP_BACKREF1); break; + case 2: r = add_opcode(reg, OP_BACKREF2); break; + default: + r = add_opcode(reg, OP_BACKREFN); + if (r) return r; + r = add_mem_num(reg, n); + break; + } + } } else { - int* p; - add_opcode(reg, (IS_IGNORECASE(reg->options) ? - OP_BACKREF_MULTI_IC : OP_BACKREF_MULTI)); - if (r) return r; - add_length(reg, br->back_num); - if (r) return r; - p = BACKREFS_P(br); - for (i = br->back_num - 1; i >= 0; i--) { - r = add_mem_num(reg, p[i]); - if (r) return r; - } + int i; + int* p; + + if (IS_IGNORECASE(reg->options)) { + r = add_opcode(reg, OP_BACKREF_MULTI_IC); + } + else { + r = add_opcode(reg, OP_BACKREF_MULTI); + } + if (r) return r; + +#ifdef USE_BACKREF_WITH_LEVEL + add_bacref_mems: +#endif + r = add_length(reg, br->back_num); + if (r) return r; + p = BACKREFS_P(br); + for (i = br->back_num - 1; i >= 0; i--) { + r = add_mem_num(reg, p[i]); + if (r) return r; + } } } break; #ifdef USE_SUBEXP_CALL - case N_CALL: - r = compile_call(&(NCALL(node)), reg); + case NT_CALL: + r = compile_call(NCALL(node), reg); break; #endif - case N_QUALIFIER: - r = compile_qualifier_node(&(NQUALIFIER(node)), reg); + case NT_QTFR: + r = compile_quantifier_node(NQTFR(node), reg); break; - case N_EFFECT: - r = compile_effect_node(&NEFFECT(node), reg); + case NT_ENCLOSE: + r = compile_enclose_node(NENCLOSE(node), reg); break; - case N_ANCHOR: - r = compile_anchor_node(&(NANCHOR(node)), reg); + case NT_ANCHOR: + r = compile_anchor_node(NANCHOR(node), reg); break; default: @@ -1455,57 +1890,58 @@ compile_tree(Node* node, regex_t* reg) } #ifdef USE_NAMED_GROUP -typedef struct { - int new_val; -} NumMap; static int -noname_disable_map(Node** plink, NumMap* map, int* counter) +noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { int r = 0; Node* node = *plink; switch (NTYPE(node)) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: do { - r = noname_disable_map(&(NCONS(node).left), map, counter); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = noname_disable_map(&(NCAR(node)), map, counter); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: + case NT_QTFR: { - Node** ptarget = &(NQUALIFIER(node).target); + Node** ptarget = &(NQTFR(node)->target); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); - if (*ptarget != old && NTYPE(*ptarget) == N_QUALIFIER) { - onig_reduce_nested_qualifier(node, *ptarget); + if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) { + onig_reduce_nested_quantifier(node, *ptarget); } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); - if (en->type == EFFECT_MEMORY) { - if (IS_EFFECT_NAMED_GROUP(en)) { - (*counter)++; - map[en->regnum].new_val = *counter; - en->regnum = *counter; - r = noname_disable_map(&(en->target), map, counter); - } - else { - *plink = en->target; - en->target = NULL_NODE; - onig_node_free(node); - r = noname_disable_map(plink, map, counter); - } + EncloseNode* en = NENCLOSE(node); + if (en->type == ENCLOSE_MEMORY) { + if (IS_ENCLOSE_NAMED_GROUP(en)) { + (*counter)++; + map[en->regnum].new_val = *counter; + en->regnum = *counter; + } + else if (en->regnum != 0) { + *plink = en->target; + en->target = NULL_NODE; + onig_node_free(node); + r = noname_disable_map(plink, map, counter); + break; + } } - else - r = noname_disable_map(&(en->target), map, counter); + r = noname_disable_map(&(en->target), map, counter); } break; + case NT_ANCHOR: + if (NANCHOR(node)->target) + r = noname_disable_map(&(NANCHOR(node)->target), map, counter); + break; + default: break; } @@ -1514,11 +1950,11 @@ noname_disable_map(Node** plink, NumMap* map, int* counter) } static int -renumber_node_backref(Node* node, NumMap* map) +renumber_node_backref(Node* node, GroupNumRemap* map, const int num_mem) { int i, pos, n, old_num; int *backs; - BackrefNode* bn = &(NBACKREF(node)); + BRefNode* bn = NBREF(node); if (! IS_BACKREF_NAME_REF(bn)) return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; @@ -1530,6 +1966,7 @@ renumber_node_backref(Node* node, NumMap* map) backs = bn->back_dynamic; for (i = 0, pos = 0; i < old_num; i++) { + if (backs[i] > num_mem) return ONIGERR_INVALID_BACKREF; n = map[backs[i]].new_val; if (n > 0) { backs[pos] = n; @@ -1542,26 +1979,38 @@ renumber_node_backref(Node* node, NumMap* map) } static int -renumber_by_map(Node* node, NumMap* map) +renumber_by_map(Node* node, GroupNumRemap* map, const int num_mem) { int r = 0; switch (NTYPE(node)) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: do { - r = renumber_by_map(NCONS(node).left, map); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = renumber_by_map(NCAR(node), map, num_mem); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = renumber_by_map(NQUALIFIER(node).target, map); + case NT_QTFR: + r = renumber_by_map(NQTFR(node)->target, map, num_mem); break; - case N_EFFECT: - r = renumber_by_map(NEFFECT(node).target, map); + case NT_ENCLOSE: + { + EncloseNode* en = NENCLOSE(node); + if (en->type == ENCLOSE_CONDITION) { + if (en->regnum > num_mem) return ONIGERR_INVALID_BACKREF; + en->regnum = map[en->regnum].new_val; + } + r = renumber_by_map(en->target, map, num_mem); + } + break; + + case NT_BREF: + r = renumber_node_backref(node, map, num_mem); break; - case N_BACKREF: - r = renumber_node_backref(node, map); + case NT_ANCHOR: + if (NANCHOR(node)->target) + r = renumber_by_map(NANCHOR(node)->target, map, num_mem); break; default: @@ -1577,24 +2026,29 @@ numbered_ref_check(Node* node) int r = 0; switch (NTYPE(node)) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: do { - r = numbered_ref_check(NCONS(node).left); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = numbered_ref_check(NCAR(node)); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = numbered_ref_check(NQUALIFIER(node).target); + case NT_QTFR: + r = numbered_ref_check(NQTFR(node)->target); break; - case N_EFFECT: - r = numbered_ref_check(NEFFECT(node).target); + case NT_ENCLOSE: + r = numbered_ref_check(NENCLOSE(node)->target); break; - case N_BACKREF: - if (! IS_BACKREF_NAME_REF(&(NBACKREF(node)))) + case NT_BREF: + if (! IS_BACKREF_NAME_REF(NBREF(node))) return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; break; + case NT_ANCHOR: + if (NANCHOR(node)->target) + r = numbered_ref_check(NANCHOR(node)->target); + break; + default: break; } @@ -1607,10 +2061,10 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) { int r, i, pos, counter; BitStatusType loc; - NumMap* map; + GroupNumRemap* map; - map = (NumMap* )xalloca(sizeof(NumMap) * (env->num_mem + 1)); - CHECK_NULL_RETURN_VAL(map, ONIGERR_MEMORY); + map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1)); + CHECK_NULL_RETURN_MEMERR(map); for (i = 1; i <= env->num_mem; i++) { map[i].new_val = 0; } @@ -1618,7 +2072,7 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) r = noname_disable_map(root, map, &counter); if (r != 0) return r; - r = renumber_by_map(*root, map); + r = renumber_by_map(*root, map, env->num_mem); if (r != 0) return r; for (i = 1, pos = 1; i <= env->num_mem; i++) { @@ -1638,7 +2092,8 @@ disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) env->num_mem = env->num_named; reg->num_mem = env->num_named; - return 0; + + return onig_renumber_name_table(reg, map); } #endif /* USE_NAMED_GROUP */ @@ -1647,97 +2102,91 @@ static int unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) { int i, offset; - EffectNode* en; + EncloseNode* en; AbsAddrType addr; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS - UChar buf[SERIALIZE_BUFSIZE]; -#endif for (i = 0; i < uslist->num; i++) { - en = &(NEFFECT(uslist->us[i].target)); - if (! IS_EFFECT_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG; + en = NENCLOSE(uslist->us[i].target); + if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG; addr = en->call_addr; offset = uslist->us[i].offset; -#ifdef PLATFORM_UNALIGNED_WORD_ACCESS BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); -#else - SERIALIZE_ABSADDR(addr, buf); - BBUF_WRITE(reg, offset, buf, SIZE_ABSADDR); -#endif } return 0; } #endif -#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK +#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT static int -qualifiers_memory_node_info(Node* node) +quantifiers_memory_node_info(Node* node) { int r = 0; switch (NTYPE(node)) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: { int v; do { - v = qualifiers_memory_node_info(NCONS(node).left); - if (v > r) r = v; - } while (v >= 0 && IS_NOT_NULL(node = NCONS(node).right)); + v = quantifiers_memory_node_info(NCAR(node)); + if (v > r) r = v; + } while (v >= 0 && IS_NOT_NULL(node = NCDR(node))); } break; -#ifdef USE_SUBEXP_CALL - case N_CALL: - if (IS_CALL_RECURSION(&NCALL(node))) { +# ifdef USE_SUBEXP_CALL + case NT_CALL: + if (IS_CALL_RECURSION(NCALL(node))) { return NQ_TARGET_IS_EMPTY_REC; /* tiny version */ } else - r = qualifiers_memory_node_info(NCALL(node).target); + r = quantifiers_memory_node_info(NCALL(node)->target); break; -#endif +# endif - case N_QUALIFIER: + case NT_QTFR: { - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); if (qn->upper != 0) { - r = qualifiers_memory_node_info(qn->target); + r = quantifiers_memory_node_info(qn->target); } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_MEMORY: - return NQ_TARGET_IS_EMPTY_MEM; - break; - - case EFFECT_OPTION: - case EFFECT_STOP_BACKTRACK: - r = qualifiers_memory_node_info(en->target); - break; + case ENCLOSE_MEMORY: + return NQ_TARGET_IS_EMPTY_MEM; + break; + + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + case ENCLOSE_ABSENT: + r = quantifiers_memory_node_info(en->target); + break; default: - break; + break; } } break; - case N_BACKREF: - case N_STRING: - case N_CTYPE: - case N_CCLASS: - case N_ANYCHAR: - case N_ANCHOR: + case NT_BREF: + case NT_STR: + case NT_CTYPE: + case NT_CCLASS: + case NT_CANY: + case NT_ANCHOR: default: break; } return r; } -#endif /* USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK */ +#endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */ static int get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) @@ -1747,12 +2196,12 @@ get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) *min = 0; switch (NTYPE(node)) { - case N_BACKREF: + case NT_BREF: { int i; int* backs; Node** nodes = SCANENV_MEM_NODES(env); - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) break; backs = BACKREFS_P(br); @@ -1760,106 +2209,110 @@ get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) r = get_min_match_length(nodes[backs[0]], min, env); if (r != 0) break; for (i = 1; i < br->back_num; i++) { - if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - r = get_min_match_length(nodes[backs[i]], &tmin, env); - if (r != 0) break; - if (*min > tmin) *min = tmin; + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_min_match_length(nodes[backs[i]], &tmin, env); + if (r != 0) break; + if (*min > tmin) *min = tmin; } } break; #ifdef USE_SUBEXP_CALL - case N_CALL: - if (IS_CALL_RECURSION(&NCALL(node))) { - EffectNode* en = &(NEFFECT(NCALL(node).target)); - if (IS_EFFECT_MIN_FIXED(en)) - *min = en->min_len; + case NT_CALL: + if (IS_CALL_RECURSION(NCALL(node))) { + EncloseNode* en = NENCLOSE(NCALL(node)->target); + if (IS_ENCLOSE_MIN_FIXED(en)) + *min = en->min_len; } else - r = get_min_match_length(NCALL(node).target, min, env); + r = get_min_match_length(NCALL(node)->target, min, env); break; #endif - case N_LIST: + case NT_LIST: do { - r = get_min_match_length(NCONS(node).left, &tmin, env); + r = get_min_match_length(NCAR(node), &tmin, env); if (r == 0) *min += tmin; - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_ALT: + case NT_ALT: { Node *x, *y; y = node; do { - x = NCONS(y).left; - r = get_min_match_length(x, &tmin, env); - if (r != 0) break; - if (y == node) *min = tmin; - else if (*min > tmin) *min = tmin; - } while (r == 0 && IS_NOT_NULL(y = NCONS(y).right)); + x = NCAR(y); + r = get_min_match_length(x, &tmin, env); + if (r != 0) break; + if (y == node) *min = tmin; + else if (*min > tmin) *min = tmin; + } while (r == 0 && IS_NOT_NULL(y = NCDR(y))); } break; - case N_STRING: + case NT_STR: { - StrNode* sn = &(NSTRING(node)); + StrNode* sn = NSTR(node); *min = sn->end - sn->s; } break; - case N_CTYPE: - switch (NCTYPE(node).type) { - case CTYPE_WORD: *min = 1; break; - case CTYPE_NOT_WORD: *min = 1; break; - default: - break; - } + case NT_CTYPE: + *min = 1; break; - case N_CCLASS: - case N_ANYCHAR: + case NT_CCLASS: + case NT_CANY: *min = 1; break; - case N_QUALIFIER: + case NT_QTFR: { - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); if (qn->lower > 0) { - r = get_min_match_length(qn->target, min, env); - if (r == 0) - *min = distance_multiply(*min, qn->lower); + r = get_min_match_length(qn->target, min, env); + if (r == 0) + *min = distance_multiply(*min, qn->lower); } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_MEMORY: -#ifdef USE_SUBEXP_CALL - if (IS_EFFECT_MIN_FIXED(en)) - *min = en->min_len; - else { - r = get_min_match_length(en->target, min, env); - if (r == 0) { - en->min_len = *min; - SET_EFFECT_STATUS(node, NST_MIN_FIXED); - } - } - break; -#endif - case EFFECT_OPTION: - case EFFECT_STOP_BACKTRACK: - r = get_min_match_length(en->target, min, env); - break; + case ENCLOSE_MEMORY: + if (IS_ENCLOSE_MIN_FIXED(en)) + *min = en->min_len; + else { + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *min = 0; /* recursive */ + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_min_match_length(en->target, min, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->min_len = *min; + SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); + } + } + } + break; + + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + r = get_min_match_length(en->target, min, env); + break; + + case ENCLOSE_ABSENT: + break; } } break; - case N_ANCHOR: + case NT_ANCHOR: default: break; } @@ -1875,116 +2328,117 @@ get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) *max = 0; switch (NTYPE(node)) { - case N_LIST: + case NT_LIST: do { - r = get_max_match_length(NCONS(node).left, &tmax, env); + r = get_max_match_length(NCAR(node), &tmax, env); if (r == 0) - *max = distance_add(*max, tmax); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + *max = distance_add(*max, tmax); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_ALT: + case NT_ALT: do { - r = get_max_match_length(NCONS(node).left, &tmax, env); + r = get_max_match_length(NCAR(node), &tmax, env); if (r == 0 && *max < tmax) *max = tmax; - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_STRING: + case NT_STR: { - StrNode* sn = &(NSTRING(node)); + StrNode* sn = NSTR(node); *max = sn->end - sn->s; } break; - case N_CTYPE: - switch (NCTYPE(node).type) { - case CTYPE_WORD: - case CTYPE_NOT_WORD: - *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); - break; - - default: - break; - } + case NT_CTYPE: + *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); break; - case N_CCLASS: - case N_ANYCHAR: + case NT_CCLASS: + case NT_CANY: *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); break; - case N_BACKREF: + case NT_BREF: { int i; int* backs; Node** nodes = SCANENV_MEM_NODES(env); - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) { - *max = ONIG_INFINITE_DISTANCE; - break; + *max = ONIG_INFINITE_DISTANCE; + break; } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - r = get_max_match_length(nodes[backs[i]], &tmax, env); - if (r != 0) break; - if (*max < tmax) *max = tmax; + if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + r = get_max_match_length(nodes[backs[i]], &tmax, env); + if (r != 0) break; + if (*max < tmax) *max = tmax; } } break; #ifdef USE_SUBEXP_CALL - case N_CALL: - if (! IS_CALL_RECURSION(&(NCALL(node)))) - r = get_max_match_length(NCALL(node).target, max, env); + case NT_CALL: + if (! IS_CALL_RECURSION(NCALL(node))) + r = get_max_match_length(NCALL(node)->target, max, env); else *max = ONIG_INFINITE_DISTANCE; break; #endif - case N_QUALIFIER: + case NT_QTFR: { - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); if (qn->upper != 0) { - r = get_max_match_length(qn->target, max, env); - if (r == 0 && *max != 0) { - if (! IS_REPEAT_INFINITE(qn->upper)) - *max = distance_multiply(*max, qn->upper); - else - *max = ONIG_INFINITE_DISTANCE; - } + r = get_max_match_length(qn->target, max, env); + if (r == 0 && *max != 0) { + if (! IS_REPEAT_INFINITE(qn->upper)) + *max = distance_multiply(*max, qn->upper); + else + *max = ONIG_INFINITE_DISTANCE; + } } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_MEMORY: -#ifdef USE_SUBEXP_CALL - if (IS_EFFECT_MAX_FIXED(en)) - *max = en->max_len; - else { - r = get_max_match_length(en->target, max, env); - if (r == 0) { - en->max_len = *max; - SET_EFFECT_STATUS(node, NST_MAX_FIXED); - } - } - break; -#endif - case EFFECT_OPTION: - case EFFECT_STOP_BACKTRACK: - r = get_max_match_length(en->target, max, env); - break; + case ENCLOSE_MEMORY: + if (IS_ENCLOSE_MAX_FIXED(en)) + *max = en->max_len; + else { + if (IS_ENCLOSE_MARK1(NENCLOSE(node))) + *max = ONIG_INFINITE_DISTANCE; + else { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = get_max_match_length(en->target, max, env); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + if (r == 0) { + en->max_len = *max; + SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); + } + } + } + break; + + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + r = get_max_match_length(en->target, max, env); + break; + + case ENCLOSE_ABSENT: + break; } } break; - case N_ANCHOR: + case NT_ANCHOR: default: break; } @@ -2005,115 +2459,112 @@ get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) level++; *len = 0; switch (NTYPE(node)) { - case N_LIST: + case NT_LIST: do { - r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level); + r = get_char_length_tree1(NCAR(node), reg, &tlen, level); if (r == 0) - *len = distance_add(*len, tlen); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + *len = (int )distance_add(*len, tlen); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_ALT: + case NT_ALT: { int tlen2; int varlen = 0; - r = get_char_length_tree1(NCONS(node).left, reg, &tlen, level); - while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)) { - r = get_char_length_tree1(NCONS(node).left, reg, &tlen2, level); - if (r == 0) { - if (tlen != tlen2) - varlen = 1; - } + r = get_char_length_tree1(NCAR(node), reg, &tlen, level); + while (r == 0 && IS_NOT_NULL(node = NCDR(node))) { + r = get_char_length_tree1(NCAR(node), reg, &tlen2, level); + if (r == 0) { + if (tlen != tlen2) + varlen = 1; + } } if (r == 0) { - if (varlen != 0) { - if (level == 1) - r = GET_CHAR_LEN_TOP_ALT_VARLEN; - else - r = GET_CHAR_LEN_VARLEN; - } - else - *len = tlen; + if (varlen != 0) { + if (level == 1) + r = GET_CHAR_LEN_TOP_ALT_VARLEN; + else + r = GET_CHAR_LEN_VARLEN; + } + else + *len = tlen; } } break; - case N_STRING: + case NT_STR: { - StrNode* sn = &(NSTRING(node)); + StrNode* sn = NSTR(node); UChar *s = sn->s; while (s < sn->end) { - s += enc_len(reg->enc, *s); - (*len)++; + s += enclen(reg->enc, s, sn->end); + (*len)++; } } break; - case N_QUALIFIER: + case NT_QTFR: { - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); if (qn->lower == qn->upper) { - r = get_char_length_tree1(qn->target, reg, &tlen, level); - if (r == 0) - *len = distance_multiply(tlen, qn->lower); + r = get_char_length_tree1(qn->target, reg, &tlen, level); + if (r == 0) + *len = (int )distance_multiply(tlen, qn->lower); } else - r = GET_CHAR_LEN_VARLEN; + r = GET_CHAR_LEN_VARLEN; } break; #ifdef USE_SUBEXP_CALL - case N_CALL: - if (! IS_CALL_RECURSION(&(NCALL(node)))) - r = get_char_length_tree1(NCALL(node).target, reg, len, level); + case NT_CALL: + if (! IS_CALL_RECURSION(NCALL(node))) + r = get_char_length_tree1(NCALL(node)->target, reg, len, level); else r = GET_CHAR_LEN_VARLEN; break; #endif - case N_CTYPE: - switch (NCTYPE(node).type) { - case CTYPE_WORD: - case CTYPE_NOT_WORD: - *len = 1; - break; - } + case NT_CTYPE: + *len = 1; break; - case N_CCLASS: - case N_ANYCHAR: + case NT_CCLASS: + case NT_CANY: *len = 1; break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_MEMORY: + case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL - if (IS_EFFECT_CLEN_FIXED(en)) - *len = en->char_len; - else { - r = get_char_length_tree1(en->target, reg, len, level); - if (r == 0) { - en->char_len = *len; - SET_EFFECT_STATUS(node, NST_CLEN_FIXED); - } - } - break; + if (IS_ENCLOSE_CLEN_FIXED(en)) + *len = en->char_len; + else { + r = get_char_length_tree1(en->target, reg, len, level); + if (r == 0) { + en->char_len = *len; + SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED); + } + } + break; #endif - case EFFECT_OPTION: - case EFFECT_STOP_BACKTRACK: - r = get_char_length_tree1(en->target, reg, len, level); - break; + case ENCLOSE_OPTION: + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + r = get_char_length_tree1(en->target, reg, len, level); + break; + case ENCLOSE_ABSENT: default: - break; + break; } } break; - case N_ANCHOR: + case NT_ANCHOR: break; default: @@ -2130,210 +2581,191 @@ get_char_length_tree(Node* node, regex_t* reg, int* len) return get_char_length_tree1(node, reg, len, 0); } -extern int -onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) -{ - int found; - - if (code >= SINGLE_BYTE_SIZE) { - if (IS_NULL(cc->mbuf)) { - found = 0; - } - else { - found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); - } - } - else { - found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); - } - - if (cc->not == 0) - return found; - else - return !found; -} - /* x is not included y ==> 1 : 0 */ static int is_not_included(Node* x, Node* y, regex_t* reg) { - int i, len; + int i; + OnigDistance len; OnigCodePoint code; - UChar *p, c; + UChar *p; int ytype; retry: ytype = NTYPE(y); switch (NTYPE(x)) { - case N_CTYPE: + case NT_CTYPE: { switch (ytype) { - case N_CTYPE: - switch (NCTYPE(x).type) { - case CTYPE_WORD: - if (NCTYPE(y).type == CTYPE_NOT_WORD) - return 1; - else - return 0; - break; - case CTYPE_NOT_WORD: - if (NCTYPE(y).type == CTYPE_WORD) - return 1; - else - return 0; - break; - default: - break; - } - break; - - case N_CCLASS: + case NT_CTYPE: + if (NCTYPE(y)->ctype == NCTYPE(x)->ctype && + NCTYPE(y)->not != NCTYPE(x)->not && + NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range) + return 1; + else + return 0; + break; + + case NT_CCLASS: swap: - { - Node* tmp; - tmp = x; x = y; y = tmp; - goto retry; - } - break; + { + Node* tmp; + tmp = x; x = y; y = tmp; + goto retry; + } + break; - case N_STRING: - goto swap; - break; + case NT_STR: + goto swap; + break; default: - break; + break; } } break; - case N_CCLASS: + case NT_CCLASS: { - CClassNode* xc = &(NCCLASS(x)); + CClassNode* xc = NCCLASS(x); switch (ytype) { - case N_CTYPE: - switch (NCTYPE(y).type) { - case CTYPE_WORD: - if (IS_NULL(xc->mbuf) && xc->not == 0) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (BITSET_AT(xc->bs, i)) { - if (ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) return 0; - } - } - return 1; - } - return 0; - break; - case CTYPE_NOT_WORD: - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - if (! ONIGENC_IS_CODE_SB_WORD(reg->enc, i)) { - if (xc->not == 0) { - if (BITSET_AT(xc->bs, i)) - return 0; - } - else { - if (! BITSET_AT(xc->bs, i)) - return 0; - } - } - } - return 1; - break; - - default: - break; - } - break; - - case N_CCLASS: - { - int v; - CClassNode* yc = &(NCCLASS(y)); - - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - v = BITSET_AT(xc->bs, i); - if ((v != 0 && xc->not == 0) || (v == 0 && xc->not)) { - v = BITSET_AT(yc->bs, i); - if ((v != 0 && yc->not == 0) || (v == 0 && yc->not)) - return 0; - } - } - if ((IS_NULL(xc->mbuf) && xc->not == 0) || - (IS_NULL(yc->mbuf) && yc->not == 0)) - return 1; - return 0; - } - break; + case NT_CTYPE: + switch (NCTYPE(y)->ctype) { + case ONIGENC_CTYPE_WORD: + if (NCTYPE(y)->not == 0) { + if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (BITSET_AT(xc->bs, i)) { + if (NCTYPE(y)->ascii_range) { + if (IS_CODE_SB_WORD(reg->enc, i)) return 0; + } + else { + if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0; + } + } + } + return 1; + } + return 0; + } + else { + if (IS_NOT_NULL(xc->mbuf)) return 0; + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + int is_word; + if (NCTYPE(y)->ascii_range) + is_word = IS_CODE_SB_WORD(reg->enc, i); + else + is_word = ONIGENC_IS_CODE_WORD(reg->enc, i); + if (! is_word) { + if (!IS_NCCLASS_NOT(xc)) { + if (BITSET_AT(xc->bs, i)) + return 0; + } + else { + if (! BITSET_AT(xc->bs, i)) + return 0; + } + } + } + return 1; + } + break; + + default: + break; + } + break; + + case NT_CCLASS: + { + int v; + CClassNode* yc = NCCLASS(y); + + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + v = BITSET_AT(xc->bs, i); + if ((v != 0 && !IS_NCCLASS_NOT(xc)) || + (v == 0 && IS_NCCLASS_NOT(xc))) { + v = BITSET_AT(yc->bs, i); + if ((v != 0 && !IS_NCCLASS_NOT(yc)) || + (v == 0 && IS_NCCLASS_NOT(yc))) + return 0; + } + } + if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) || + (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc))) + return 1; + return 0; + } + break; - case N_STRING: - goto swap; - break; + case NT_STR: + goto swap; + break; default: - break; + break; } } break; - case N_STRING: + case NT_STR: { - StrNode* xs = &(NSTRING(x)); + StrNode* xs = NSTR(x); if (NSTRING_LEN(x) == 0) - break; + break; - c = *(xs->s); switch (ytype) { - case N_CTYPE: - switch (NCTYPE(y).type) { - case CTYPE_WORD: - return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 0 : 1); - break; - case CTYPE_NOT_WORD: - return (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end) ? 1 : 0); - break; - default: - break; - } - break; + case NT_CTYPE: + switch (NCTYPE(y)->ctype) { + case ONIGENC_CTYPE_WORD: + if (NCTYPE(y)->ascii_range) { + if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end)) + return NCTYPE(y)->not; + else + return !(NCTYPE(y)->not); + } + else { + if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) + return NCTYPE(y)->not; + else + return !(NCTYPE(y)->not); + } + break; + default: + break; + } + break; - case N_CCLASS: - { - CClassNode* cc = &(NCCLASS(y)); + case NT_CCLASS: + { + CClassNode* cc = NCCLASS(y); + + code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, + xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); + return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); + } + break; + + case NT_STR: + { + UChar *q; + StrNode* ys = NSTR(y); + len = NSTRING_LEN(x); + if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); + if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { + /* tiny version */ + return 0; + } + else { + for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) { + if (*p != *q) return 1; + } + } + } + break; - code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, - xs->s + enc_len(reg->enc, c)); - return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); - } - break; - - case N_STRING: - { - UChar *q; - StrNode* ys = &(NSTRING(y)); - len = NSTRING_LEN(x); - if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); - if (NSTRING_IS_CASE_AMBIG(x) || NSTRING_IS_CASE_AMBIG(y)) { - UChar plow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - UChar qlow[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int plen, qlen; - for (p = ys->s, q = xs->s; q < xs->end; ) { - plen = ONIGENC_MBC_TO_LOWER(reg->enc, p, plow); - qlen = ONIGENC_MBC_TO_LOWER(reg->enc, q, qlow); - if (plen != qlen || onig_strncmp(plow, qlow, plen) != 0) - return 1; - p += enc_len(reg->enc, *p); - q += enc_len(reg->enc, *q); - } - } - else { - for (i = 0, p = ys->s, q = xs->s; i < len; i++, p++, q++) { - if (*p != *q) return 1; - } - } - } - break; - default: - break; + break; } } break; @@ -2351,80 +2783,81 @@ get_head_value_node(Node* node, int exact, regex_t* reg) Node* n = NULL_NODE; switch (NTYPE(node)) { - case N_BACKREF: - case N_ALT: - case N_ANYCHAR: + case NT_BREF: + case NT_ALT: + case NT_CANY: #ifdef USE_SUBEXP_CALL - case N_CALL: + case NT_CALL: #endif break; - case N_CTYPE: - case N_CCLASS: + case NT_CTYPE: + case NT_CCLASS: if (exact == 0) { n = node; } break; - case N_LIST: - n = get_head_value_node(NCONS(node).left, exact, reg); + case NT_LIST: + n = get_head_value_node(NCAR(node), exact, reg); break; - case N_STRING: + case NT_STR: { - StrNode* sn = &(NSTRING(node)); - + StrNode* sn = NSTR(node); if (sn->end <= sn->s) - break; + break; - if (exact != 0 && - !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - if (! ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, sn->s)) - n = node; - } - else { - n = node; + if (exact == 0 || + NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) { + n = node; } } break; - case N_QUALIFIER: + case NT_QTFR: { - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); if (qn->lower > 0) { - if (IS_NOT_NULL(qn->head_exact)) - n = qn->head_exact; - else - n = get_head_value_node(qn->target, exact, reg); +#ifdef USE_OP_PUSH_OR_JUMP_EXACT + if (IS_NOT_NULL(qn->head_exact)) + n = qn->head_exact; + else +#endif + n = get_head_value_node(qn->target, exact, reg); } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_OPTION: - { - OnigOptionType options = reg->options; + case ENCLOSE_OPTION: + { + OnigOptionType options = reg->options; - reg->options = NEFFECT(node).option; - n = get_head_value_node(NEFFECT(node).target, exact, reg); - reg->options = options; - } - break; + reg->options = NENCLOSE(node)->option; + n = get_head_value_node(NENCLOSE(node)->target, exact, reg); + reg->options = options; + } + break; - case EFFECT_MEMORY: - case EFFECT_STOP_BACKTRACK: - n = get_head_value_node(en->target, exact, reg); - break; + case ENCLOSE_MEMORY: + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + n = get_head_value_node(en->target, exact, reg); + break; + + case ENCLOSE_ABSENT: + break; } } break; - case N_ANCHOR: - if (NANCHOR(node).type == ANCHOR_PREC_READ) - n = get_head_value_node(NANCHOR(node).target, exact, reg); + case NT_ANCHOR: + if (NANCHOR(node)->type == ANCHOR_PREC_READ) + n = get_head_value_node(NANCHOR(node)->target, exact, reg); break; default: @@ -2435,45 +2868,46 @@ get_head_value_node(Node* node, int exact, regex_t* reg) } static int -check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask) +check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask) { int type, r = 0; type = NTYPE(node); - if ((type & type_mask) == 0) + if ((NTYPE2BIT(type) & type_mask) == 0) return 1; switch (type) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: do { - r = check_type_tree(NCONS(node).left, type_mask, effect_mask, anchor_mask); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = check_type_tree(NCAR(node), type_mask, enclose_mask, + anchor_mask); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = check_type_tree(NQUALIFIER(node).target, type_mask, effect_mask, - anchor_mask); + case NT_QTFR: + r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask, + anchor_mask); break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); - if ((en->type & effect_mask) == 0) - return 1; + EncloseNode* en = NENCLOSE(node); + if ((en->type & enclose_mask) == 0) + return 1; - r = check_type_tree(en->target, type_mask, effect_mask, anchor_mask); + r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask); } break; - case N_ANCHOR: - type = NANCHOR(node).type; + case NT_ANCHOR: + type = NANCHOR(node)->type; if ((type & anchor_mask) == 0) return 1; - if (NANCHOR(node).target) - r = check_type_tree(NANCHOR(node).target, - type_mask, effect_mask, anchor_mask); + if (NANCHOR(node)->target) + r = check_type_tree(NANCHOR(node)->target, + type_mask, enclose_mask, anchor_mask); break; default: @@ -2484,8 +2918,8 @@ check_type_tree(Node* node, int type_mask, int effect_mask, int anchor_mask) #ifdef USE_SUBEXP_CALL -#define RECURSION_EXIST 1 -#define RECURSION_INFINITE 2 +# define RECURSION_EXIST 1 +# define RECURSION_INFINITE 2 static int subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) @@ -2495,7 +2929,7 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: { Node *x; OnigDistance min; @@ -2503,61 +2937,64 @@ subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) x = node; do { - ret = subexp_inf_recursive_check(NCONS(x).left, env, head); - if (ret < 0 || ret == RECURSION_INFINITE) return ret; - r |= ret; - if (head) { - ret = get_min_match_length(NCONS(x).left, &min, env); - if (ret != 0) return ret; - if (min != 0) head = 0; - } - } while (IS_NOT_NULL(x = NCONS(x).right)); + ret = subexp_inf_recursive_check(NCAR(x), env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r |= ret; + if (head) { + ret = get_min_match_length(NCAR(x), &min, env); + if (ret != 0) return ret; + if (min != 0) head = 0; + } + } while (IS_NOT_NULL(x = NCDR(x))); } break; - case N_ALT: + case NT_ALT: { int ret; r = RECURSION_EXIST; do { - ret = subexp_inf_recursive_check(NCONS(node).left, env, head); - if (ret < 0 || ret == RECURSION_INFINITE) return ret; - r &= ret; - } while (IS_NOT_NULL(node = NCONS(node).right)); + ret = subexp_inf_recursive_check(NCAR(node), env, head); + if (ret < 0 || ret == RECURSION_INFINITE) return ret; + r &= ret; + } while (IS_NOT_NULL(node = NCDR(node))); } break; - case N_QUALIFIER: - r = subexp_inf_recursive_check(NQUALIFIER(node).target, env, head); + case NT_QTFR: + r = subexp_inf_recursive_check(NQTFR(node)->target, env, head); + if (r == RECURSION_EXIST) { + if (NQTFR(node)->lower == 0) r = 0; + } break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: - r = subexp_inf_recursive_check(an->target, env, head); - break; + r = subexp_inf_recursive_check(an->target, env, head); + break; } } break; - case N_CALL: - r = subexp_inf_recursive_check(NCALL(node).target, env, head); + case NT_CALL: + r = subexp_inf_recursive_check(NCALL(node)->target, env, head); break; - case N_EFFECT: - if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + case NT_ENCLOSE: + if (IS_ENCLOSE_MARK2(NENCLOSE(node))) return 0; - else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + else if (IS_ENCLOSE_MARK1(NENCLOSE(node))) return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE); else { - SET_EFFECT_STATUS(node, NST_MARK2); - r = subexp_inf_recursive_check(NEFFECT(node).target, env, head); - CLEAR_EFFECT_STATUS(node, NST_MARK2); + SET_ENCLOSE_STATUS(node, NST_MARK2); + r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head); + CLEAR_ENCLOSE_STATUS(node, NST_MARK2); } break; @@ -2576,40 +3013,40 @@ subexp_inf_recursive_check_trav(Node* node, ScanEnv* env) type = NTYPE(node); switch (type) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: do { - r = subexp_inf_recursive_check_trav(NCONS(node).left, env); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = subexp_inf_recursive_check_trav(NCAR(node), env); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = subexp_inf_recursive_check_trav(NQUALIFIER(node).target, env); + case NT_QTFR: + r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env); break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: - r = subexp_inf_recursive_check_trav(an->target, env); - break; + r = subexp_inf_recursive_check_trav(an->target, env); + break; } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); - if (IS_EFFECT_RECURSION(en)) { - SET_EFFECT_STATUS(node, NST_MARK1); - r = subexp_inf_recursive_check(en->target, env, 1); - if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION; - CLEAR_EFFECT_STATUS(node, NST_MARK1); + if (IS_ENCLOSE_RECURSION(en)) { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = subexp_inf_recursive_check(en->target, env, 1); + if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION; + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); } r = subexp_inf_recursive_check_trav(en->target, env); } @@ -2626,50 +3063,48 @@ subexp_inf_recursive_check_trav(Node* node, ScanEnv* env) static int subexp_recursive_check(Node* node) { - int type; int r = 0; - type = NTYPE(node); - switch (type) { - case N_LIST: - case N_ALT: + switch (NTYPE(node)) { + case NT_LIST: + case NT_ALT: do { - r |= subexp_recursive_check(NCONS(node).left); - } while (IS_NOT_NULL(node = NCONS(node).right)); + r |= subexp_recursive_check(NCAR(node)); + } while (IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = subexp_recursive_check(NQUALIFIER(node).target); + case NT_QTFR: + r = subexp_recursive_check(NQTFR(node)->target); break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: - r = subexp_recursive_check(an->target); - break; + r = subexp_recursive_check(an->target); + break; } } break; - case N_CALL: - r = subexp_recursive_check(NCALL(node).target); + case NT_CALL: + r = subexp_recursive_check(NCALL(node)->target); if (r != 0) SET_CALL_RECURSION(node); break; - case N_EFFECT: - if (IS_EFFECT_MARK2(&(NEFFECT(node)))) + case NT_ENCLOSE: + if (IS_ENCLOSE_MARK2(NENCLOSE(node))) return 0; - else if (IS_EFFECT_MARK1(&(NEFFECT(node)))) + else if (IS_ENCLOSE_MARK1(NENCLOSE(node))) return 1; /* recursion */ else { - SET_EFFECT_STATUS(node, NST_MARK2); - r = subexp_recursive_check(NEFFECT(node).target); - CLEAR_EFFECT_STATUS(node, NST_MARK2); + SET_ENCLOSE_STATUS(node, NST_MARK2); + r = subexp_recursive_check(NENCLOSE(node)->target); + CLEAR_ENCLOSE_STATUS(node, NST_MARK2); } break; @@ -2684,62 +3119,62 @@ subexp_recursive_check(Node* node) static int subexp_recursive_check_trav(Node* node, ScanEnv* env) { -#define FOUND_CALLED_NODE 1 +# define FOUND_CALLED_NODE 1 int type; int r = 0; type = NTYPE(node); switch (type) { - case N_LIST: - case N_ALT: + case NT_LIST: + case NT_ALT: { int ret; do { - ret = subexp_recursive_check_trav(NCONS(node).left, env); - if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE; - else if (ret < 0) return ret; - } while (IS_NOT_NULL(node = NCONS(node).right)); + ret = subexp_recursive_check_trav(NCAR(node), env); + if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE; + else if (ret < 0) return ret; + } while (IS_NOT_NULL(node = NCDR(node))); } break; - case N_QUALIFIER: - r = subexp_recursive_check_trav(NQUALIFIER(node).target, env); - if (NQUALIFIER(node).upper == 0) { + case NT_QTFR: + r = subexp_recursive_check_trav(NQTFR(node)->target, env); + if (NQTFR(node)->upper == 0) { if (r == FOUND_CALLED_NODE) - NQUALIFIER(node).is_refered = 1; + NQTFR(node)->is_referred = 1; } break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: - r = subexp_recursive_check_trav(an->target, env); - break; + r = subexp_recursive_check_trav(an->target, env); + break; } } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); - - if (! IS_EFFECT_RECURSION(en)) { - if (IS_EFFECT_CALLED(en)) { - SET_EFFECT_STATUS(node, NST_MARK1); - r = subexp_recursive_check(en->target); - if (r != 0) SET_EFFECT_STATUS(node, NST_RECURSION); - CLEAR_EFFECT_STATUS(node, NST_MARK1); - } + EncloseNode* en = NENCLOSE(node); + + if (! IS_ENCLOSE_RECURSION(en)) { + if (IS_ENCLOSE_CALLED(en)) { + SET_ENCLOSE_STATUS(node, NST_MARK1); + r = subexp_recursive_check(en->target); + if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION); + CLEAR_ENCLOSE_STATUS(node, NST_MARK1); + } } r = subexp_recursive_check_trav(en->target, env); - if (IS_EFFECT_CALLED(en)) - r |= FOUND_CALLED_NODE; + if (IS_ENCLOSE_CALLED(en)) + r |= FOUND_CALLED_NODE; } break; @@ -2758,93 +3193,101 @@ setup_subexp_call(Node* node, ScanEnv* env) type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: do { - r = setup_subexp_call(NCONS(node).left, env); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = setup_subexp_call(NCAR(node), env); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_ALT: + case NT_ALT: do { - r = setup_subexp_call(NCONS(node).left, env); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = setup_subexp_call(NCAR(node), env); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_QUALIFIER: - r = setup_subexp_call(NQUALIFIER(node).target, env); + case NT_QTFR: + r = setup_subexp_call(NQTFR(node)->target, env); break; - case N_EFFECT: - r = setup_subexp_call(NEFFECT(node).target, env); + case NT_ENCLOSE: + r = setup_subexp_call(NENCLOSE(node)->target, env); break; - case N_CALL: + case NT_CALL: { - int n, num, *refs; - UChar *p; - CallNode* cn = &(NCALL(node)); + CallNode* cn = NCALL(node); Node** nodes = SCANENV_MEM_NODES(env); -#ifdef USE_NAMED_GROUP - n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs); -#else - n = -1; -#endif - if (n <= 0) { - /* name not found, check group number. (?*ddd) */ - p = cn->name; - num = onig_scan_unsigned_number(&p, cn->name_end, env->enc); - if (num <= 0 || p != cn->name_end) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); - return ONIGERR_UNDEFINED_NAME_REFERENCE; - } -#ifdef USE_NAMED_GROUP - if (env->num_named > 0 && - IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && - !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { - return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; - } -#endif - if (num > env->num_mem) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); - return ONIGERR_UNDEFINED_GROUP_REFERENCE; - } - cn->ref_num = num; - goto set_call_attr; + if (cn->group_num != 0) { + int gnum = cn->group_num; + +# ifdef USE_NAMED_GROUP + if (env->num_named > 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + } +# endif + if (gnum > env->num_mem) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_GROUP_REFERENCE; + } + +# ifdef USE_NAMED_GROUP + set_call_attr: +# endif + cn->target = nodes[cn->group_num]; + if (IS_NULL(cn->target)) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + SET_ENCLOSE_STATUS(cn->target, NST_CALLED); + BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num); + cn->unset_addr_list = env->unset_addr_list; } - else if (n > 1) { - onig_scan_env_set_error_string(env, - ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); - return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; +# ifdef USE_NAMED_GROUP +# ifdef USE_PERL_SUBEXP_CALL + else if (cn->name == cn->name_end) { + goto set_call_attr; } +# endif else { - cn->ref_num = refs[0]; - set_call_attr: - cn->target = nodes[cn->ref_num]; - if (IS_NULL(cn->target)) { - onig_scan_env_set_error_string(env, - ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); - return ONIGERR_UNDEFINED_NAME_REFERENCE; - } - SET_EFFECT_STATUS(cn->target, NST_CALLED); - BIT_STATUS_ON_AT(env->bt_mem_start, cn->ref_num); - cn->unset_addr_list = env->unset_addr_list; + int *refs; + + int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, + &refs); + if (n <= 0) { + onig_scan_env_set_error_string(env, + ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); + return ONIGERR_UNDEFINED_NAME_REFERENCE; + } + else if (n > 1 && + ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) { + onig_scan_env_set_error_string(env, + ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); + return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; + } + else { + cn->group_num = refs[0]; + goto set_call_attr; + } } +# endif } break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: - r = setup_subexp_call(an->target, env); - break; + r = setup_subexp_call(an->target, env); + break; } } break; @@ -2857,6 +3300,14 @@ setup_subexp_call(Node* node, ScanEnv* env) } #endif +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) +#define IN_CALL (1<<4) +#define IN_RECCALL (1<<5) +#define IN_LOOK_BEHIND (1<<6) + /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) (?<!A|B) ==> (?<!A)(?<!B) @@ -2864,30 +3315,29 @@ setup_subexp_call(Node* node, ScanEnv* env) static int divide_look_behind_alternatives(Node* node) { - Node tmp_node; Node *head, *np, *insert_node; - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); int anc_type = an->type; head = an->target; - np = NCONS(head).left; - tmp_node = *node; *node = *head; *head = tmp_node; - NCONS(node).left = head; - NANCHOR(head).target = np; + np = NCAR(head); + swap_node(node, head); + NCAR(node) = head; + NANCHOR(head)->target = np; np = node; - while ((np = NCONS(np).right) != NULL_NODE) { + while ((np = NCDR(np)) != NULL_NODE) { insert_node = onig_node_new_anchor(anc_type); - CHECK_NULL_RETURN_VAL(insert_node, ONIGERR_MEMORY); - NANCHOR(insert_node).target = NCONS(np).left; - NCONS(np).left = insert_node; + CHECK_NULL_RETURN_MEMERR(insert_node); + NANCHOR(insert_node)->target = NCAR(np); + NCAR(np) = insert_node; } if (anc_type == ANCHOR_LOOK_BEHIND_NOT) { np = node; do { - np->type = N_LIST; /* alt -> list */ - } while ((np = NCONS(np).right) != NULL_NODE); + SET_NTYPE(np, NT_LIST); /* alt -> list */ + } while ((np = NCDR(np)) != NULL_NODE); } return 0; } @@ -2896,7 +3346,7 @@ static int setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); r = get_char_length_tree(an->target, reg, &len); if (r == 0) @@ -2920,35 +3370,39 @@ next_setup(Node* node, Node* next_node, regex_t* reg) retry: type = NTYPE(node); - if (type == N_QUALIFIER) { - QualifierNode* qn = &(NQUALIFIER(node)); + if (type == NT_QTFR) { + QtfrNode* qn = NQTFR(node); if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { -#ifdef USE_QUALIFIER_PEEK_NEXT - qn->next_head_exact = get_head_value_node(next_node, 1, reg); +#ifdef USE_QTFR_PEEK_NEXT + Node* n = get_head_value_node(next_node, 1, reg); + /* '\0': for UTF-16BE etc... */ + if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') { + qn->next_head_exact = n; + } #endif - /* automatic posseivation a*b ==> (?>a*)b */ + /* automatic possessification a*b ==> (?>a*)b */ if (qn->lower <= 1) { - int ttype = NTYPE(qn->target); - if (IS_NODE_TYPE_SIMPLE(ttype)) { - Node *x, *y; - x = get_head_value_node(qn->target, 0, reg); - if (IS_NOT_NULL(x)) { - y = get_head_value_node(next_node, 0, reg); - if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { - Node* en = onig_node_new_effect(EFFECT_STOP_BACKTRACK); - CHECK_NULL_RETURN_VAL(en, ONIGERR_MEMORY); - SET_EFFECT_STATUS(en, NST_SIMPLE_REPEAT); - swap_node(node, en); - NEFFECT(node).target = en; - } - } - } + int ttype = NTYPE(qn->target); + if (IS_NODE_TYPE_SIMPLE(ttype)) { + Node *x, *y; + x = get_head_value_node(qn->target, 0, reg); + if (IS_NOT_NULL(x)) { + y = get_head_value_node(next_node, 0, reg); + if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { + Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK); + CHECK_NULL_RETURN_MEMERR(en); + SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); + swap_node(node, en); + NENCLOSE(node)->target = en; + } + } + } } } } - else if (type == N_EFFECT) { - EffectNode* en = &(NEFFECT(node)); - if (en->type == EFFECT_MEMORY) { + else if (type == NT_ENCLOSE) { + EncloseNode* en = NENCLOSE(node); + if (en->type == ENCLOSE_MEMORY && !IS_ENCLOSE_CALLED(en)) { node = en->target; goto retry; } @@ -2956,9 +3410,496 @@ next_setup(Node* node, Node* next_node, regex_t* reg) return 0; } -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) + +static int +update_string_node_case_fold(regex_t* reg, Node *node) +{ + UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *sbuf, *ebuf, *sp; + int r, i, len; + OnigDistance sbuf_size; + StrNode* sn = NSTR(node); + + end = sn->end; + sbuf_size = (end - sn->s) * 2; + sbuf = (UChar* )xmalloc(sbuf_size); + CHECK_NULL_RETURN_MEMERR(sbuf); + ebuf = sbuf + sbuf_size; + + sp = sbuf; + p = sn->s; + while (p < end) { + len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); + for (i = 0; i < len; i++) { + if (sp >= ebuf) { + UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2); + if (IS_NULL(p)) { + xfree(sbuf); + return ONIGERR_MEMORY; + } + sbuf = p; + sp = sbuf + sbuf_size; + sbuf_size *= 2; + ebuf = sbuf + sbuf_size; + } + + *sp++ = buf[i]; + } + } + + r = onig_node_str_set(node, sbuf, sp); + + xfree(sbuf); + return r; +} + +static int +expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, + regex_t* reg) +{ + int r; + Node *node; + + node = onig_node_new_str(s, end); + if (IS_NULL(node)) return ONIGERR_MEMORY; + + r = update_string_node_case_fold(reg, node); + if (r != 0) { + onig_node_free(node); + return r; + } + + NSTRING_SET_AMBIG(node); + NSTRING_SET_DONT_GET_OPT_INFO(node); + *rnode = node; + return 0; +} + +static int +is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[], + int slen) +{ + int i; + + for (i = 0; i < item_num; i++) { + if (items[i].byte_len != slen) { + return 1; + } + if (items[i].code_len != 1) { + return 1; + } + } + return 0; +} + +static int +expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], + UChar *p, int slen, UChar *end, + regex_t* reg, Node **rnode) +{ + int r, i, j, len, varlen; + Node *anode, *var_anode, *snode, *xnode, *an; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + + *rnode = var_anode = NULL_NODE; + + varlen = 0; + for (i = 0; i < item_num; i++) { + if (items[i].byte_len != slen) { + varlen = 1; + break; + } + } + + if (varlen != 0) { + *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); + if (IS_NULL(var_anode)) return ONIGERR_MEMORY; + + xnode = onig_node_new_list(NULL, NULL); + if (IS_NULL(xnode)) goto mem_err; + NCAR(var_anode) = xnode; + + anode = onig_node_new_alt(NULL_NODE, NULL_NODE); + if (IS_NULL(anode)) goto mem_err; + NCAR(xnode) = anode; + } + else { + *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); + if (IS_NULL(anode)) return ONIGERR_MEMORY; + } + + snode = onig_node_new_str(p, p + slen); + if (IS_NULL(snode)) goto mem_err; + + NCAR(anode) = snode; + + for (i = 0; i < item_num; i++) { + snode = onig_node_new_str(NULL, NULL); + if (IS_NULL(snode)) goto mem_err; + + for (j = 0; j < items[i].code_len; j++) { + len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); + if (len < 0) { + r = len; + goto mem_err2; + } + + r = onig_node_str_cat(snode, buf, buf + len); + if (r != 0) goto mem_err2; + } + + an = onig_node_new_alt(NULL_NODE, NULL_NODE); + if (IS_NULL(an)) { + goto mem_err2; + } + + if (items[i].byte_len != slen) { + Node *rem; + UChar *q = p + items[i].byte_len; + + if (q < end) { + r = expand_case_fold_make_rem_string(&rem, q, end, reg); + if (r != 0) { + onig_node_free(an); + goto mem_err2; + } + + xnode = onig_node_list_add(NULL_NODE, snode); + if (IS_NULL(xnode)) { + onig_node_free(an); + onig_node_free(rem); + goto mem_err2; + } + if (IS_NULL(onig_node_list_add(xnode, rem))) { + onig_node_free(an); + onig_node_free(xnode); + onig_node_free(rem); + goto mem_err; + } + + NCAR(an) = xnode; + } + else { + NCAR(an) = snode; + } + + NCDR(var_anode) = an; + var_anode = an; + } + else { + NCAR(an) = snode; + NCDR(anode) = an; + anode = an; + } + } + + return varlen; + + mem_err2: + onig_node_free(snode); + + mem_err: + onig_node_free(*rnode); + + return ONIGERR_MEMORY; +} + +#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 + +static int +expand_case_fold_string(Node* node, regex_t* reg, int state) +{ + int r, n, len, alt_num; + int varlen = 0; + int is_in_look_behind; + UChar *start, *end, *p; + Node *top_root, *root, *snode, *prev_node; + OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; + StrNode* sn; + + if (NSTRING_IS_AMBIG(node)) return 0; + + sn = NSTR(node); + + start = sn->s; + end = sn->end; + if (start >= end) return 0; + + is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + + r = 0; + top_root = root = prev_node = snode = NULL_NODE; + alt_num = 1; + p = start; + while (p < end) { + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, + p, end, items); + if (n < 0) { + r = n; + goto err; + } + + len = enclen(reg->enc, p, end); + + varlen = is_case_fold_variable_len(n, items, len); + if (n == 0 || varlen == 0 || is_in_look_behind) { + if (IS_NULL(snode)) { + if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { + onig_node_free(top_root); + top_root = root = onig_node_list_add(NULL_NODE, prev_node); + if (IS_NULL(root)) { + onig_node_free(prev_node); + goto mem_err; + } + } + + prev_node = snode = onig_node_new_str(NULL, NULL); + if (IS_NULL(snode)) goto mem_err; + if (IS_NOT_NULL(root)) { + if (IS_NULL(onig_node_list_add(root, snode))) { + onig_node_free(snode); + goto mem_err; + } + } + } + + r = onig_node_str_cat(snode, p, p + len); + if (r != 0) goto err; + } + else { + alt_num *= (n + 1); + if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; + + if (IS_NOT_NULL(snode)) { + r = update_string_node_case_fold(reg, snode); + if (r == 0) { + NSTRING_SET_AMBIG(snode); + } + } + if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { + onig_node_free(top_root); + top_root = root = onig_node_list_add(NULL_NODE, prev_node); + if (IS_NULL(root)) { + onig_node_free(prev_node); + goto mem_err; + } + } + + r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); + if (r < 0) goto mem_err; + if (r == 1) { + if (IS_NULL(root)) { + top_root = prev_node; + } + else { + if (IS_NULL(onig_node_list_add(root, prev_node))) { + onig_node_free(prev_node); + goto mem_err; + } + } + + root = NCAR(prev_node); + } + else { /* r == 0 */ + if (IS_NOT_NULL(root)) { + if (IS_NULL(onig_node_list_add(root, prev_node))) { + onig_node_free(prev_node); + goto mem_err; + } + } + } + + snode = NULL_NODE; + } + + p += len; + } + if (IS_NOT_NULL(snode)) { + r = update_string_node_case_fold(reg, snode); + if (r == 0) { + NSTRING_SET_AMBIG(snode); + } + } + + if (p < end) { + Node *srem; + + r = expand_case_fold_make_rem_string(&srem, p, end, reg); + if (r != 0) goto mem_err; + + if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { + onig_node_free(top_root); + top_root = root = onig_node_list_add(NULL_NODE, prev_node); + if (IS_NULL(root)) { + onig_node_free(srem); + onig_node_free(prev_node); + goto mem_err; + } + } + + if (IS_NULL(root)) { + prev_node = srem; + } + else { + if (IS_NULL(onig_node_list_add(root, srem))) { + onig_node_free(srem); + goto mem_err; + } + } + } + + /* ending */ + top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); + swap_node(node, top_root); + onig_node_free(top_root); + return 0; + + mem_err: + r = ONIGERR_MEMORY; + + err: + onig_node_free(top_root); + return r; +} + + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +# define CEC_THRES_NUM_BIG_REPEAT 512 +# define CEC_INFINITE_NUM 0x7fffffff + +# define CEC_IN_INFINITE_REPEAT (1<<0) +# define CEC_IN_FINITE_REPEAT (1<<1) +# define CEC_CONT_BIG_REPEAT (1<<2) + +static int +setup_comb_exp_check(Node* node, int state, ScanEnv* env) +{ + int type; + int r = state; + + type = NTYPE(node); + switch (type) { + case NT_LIST: + { + do { + r = setup_comb_exp_check(NCAR(node), r, env); + } while (r >= 0 && IS_NOT_NULL(node = NCDR(node))); + } + break; + + case NT_ALT: + { + int ret; + do { + ret = setup_comb_exp_check(NCAR(node), state, env); + r |= ret; + } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node))); + } + break; + + case NT_QTFR: + { + int child_state = state; + int add_state = 0; + QtfrNode* qn = NQTFR(node); + Node* target = qn->target; + int var_num; + + if (! IS_REPEAT_INFINITE(qn->upper)) { + if (qn->upper > 1) { + /* {0,1}, {1,1} are allowed */ + child_state |= CEC_IN_FINITE_REPEAT; + + /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */ + if (env->backrefed_mem == 0) { + if (NTYPE(qn->target) == NT_ENCLOSE) { + EncloseNode* en = NENCLOSE(qn->target); + if (en->type == ENCLOSE_MEMORY) { + if (NTYPE(en->target) == NT_QTFR) { + QtfrNode* q = NQTFR(en->target); + if (IS_REPEAT_INFINITE(q->upper) + && q->greedy == qn->greedy) { + qn->upper = (qn->lower == 0 ? 1 : qn->lower); + if (qn->upper == 1) + child_state = state; + } + } + } + } + } + } + } + + if (state & CEC_IN_FINITE_REPEAT) { + qn->comb_exp_check_num = -1; + } + else { + if (IS_REPEAT_INFINITE(qn->upper)) { + var_num = CEC_INFINITE_NUM; + child_state |= CEC_IN_INFINITE_REPEAT; + } + else { + var_num = qn->upper - qn->lower; + } + + if (var_num >= CEC_THRES_NUM_BIG_REPEAT) + add_state |= CEC_CONT_BIG_REPEAT; + + if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) || + ((state & CEC_CONT_BIG_REPEAT) != 0 && + var_num >= CEC_THRES_NUM_BIG_REPEAT)) { + if (qn->comb_exp_check_num == 0) { + env->num_comb_exp_check++; + qn->comb_exp_check_num = env->num_comb_exp_check; + if (env->curr_max_regnum > env->comb_exp_max_regnum) + env->comb_exp_max_regnum = env->curr_max_regnum; + } + } + } + + r = setup_comb_exp_check(target, child_state, env); + r |= add_state; + } + break; + + case NT_ENCLOSE: + { + EncloseNode* en = NENCLOSE(node); + + switch (en->type) { + case ENCLOSE_MEMORY: + { + if (env->curr_max_regnum < en->regnum) + env->curr_max_regnum = en->regnum; + + r = setup_comb_exp_check(en->target, state, env); + } + break; + + default: + r = setup_comb_exp_check(en->target, state, env); + break; + } + } + break; + +# ifdef USE_SUBEXP_CALL + case NT_CALL: + if (IS_CALL_RECURSION(NCALL(node))) + env->has_recursion = 1; + else + r = setup_comb_exp_check(NCALL(node)->target, state, env); + break; +# endif + + default: + break; + } + + return r; +} +#endif /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) @@ -2974,247 +3915,295 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) int type; int r = 0; +restart: type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: { Node* prev = NULL_NODE; do { - r = setup_tree(NCONS(node).left, reg, state, env); - if (IS_NOT_NULL(prev) && r == 0) { - r = next_setup(prev, NCONS(node).left, reg); - } - prev = NCONS(node).left; - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = setup_tree(NCAR(node), reg, state, env); + if (IS_NOT_NULL(prev) && r == 0) { + r = next_setup(prev, NCAR(node), reg); + } + prev = NCAR(node); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); } break; - case N_ALT: + case NT_ALT: do { - r = setup_tree(NCONS(node).left, reg, (state | IN_ALT), env); - } while (r == 0 && IS_NOT_NULL(node = NCONS(node).right)); + r = setup_tree(NCAR(node), reg, (state | IN_ALT), env); + } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; - case N_CCLASS: - if (IS_IGNORECASE(reg->options)) { - int i; - UChar c, lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - BitSetRef bs = NCCLASS(node).bs; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - c = (UChar )i; - ONIGENC_MBC_TO_LOWER(reg->enc, &c, lowbuf); - if (*lowbuf != c) { - if (BITSET_AT(bs, c)) BITSET_SET_BIT(bs, *lowbuf); - if (BITSET_AT(bs, *lowbuf)) BITSET_SET_BIT(bs, c); - } - } - } + case NT_CCLASS: break; - case N_STRING: + case NT_STR: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - StrNode* sn = &NSTRING(node); - UChar* p = sn->s; - - while (p < sn->end) { - if (ONIGENC_IS_MBC_CASE_AMBIG(reg->enc, p)) { - NSTRING_SET_CASE_AMBIG(node); - break; - } - p += enc_len(reg->enc, *p); - } + r = expand_case_fold_string(node, reg, state); } break; - case N_CTYPE: - case N_ANYCHAR: + case NT_CTYPE: + case NT_CANY: break; #ifdef USE_SUBEXP_CALL - case N_CALL: + case NT_CALL: break; #endif - case N_BACKREF: + case NT_BREF: { int i; int* p; Node** nodes = SCANENV_MEM_NODES(env); - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); p = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { - if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; - BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); - BIT_STATUS_ON_AT(env->bt_mem_start, p[i]); - SET_EFFECT_STATUS(nodes[p[i]], NST_MEM_BACKREFED); + if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; + BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); + BIT_STATUS_ON_AT(env->bt_mem_start, p[i]); +#ifdef USE_BACKREF_WITH_LEVEL + if (IS_BACKREF_NEST_LEVEL(br)) { + BIT_STATUS_ON_AT(env->bt_mem_end, p[i]); + } +#endif + SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED); } } break; - case N_QUALIFIER: + case NT_QTFR: { OnigDistance d; - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); Node* target = qn->target; + if ((state & IN_REPEAT) != 0) { + qn->state |= NST_IN_REPEAT; + } + if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { - r = get_min_match_length(target, &d, env); - if (r) break; - if (d == 0) { - qn->target_empty_info = NQ_TARGET_IS_EMPTY; -#ifdef USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK - r = qualifiers_memory_node_info(target); - if (r < 0) break; - if (r > 0) { - qn->target_empty_info = r; - } + r = get_min_match_length(target, &d, env); + if (r) break; + if (d == 0) { + qn->target_empty_info = NQ_TARGET_IS_EMPTY; +#ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT + r = quantifiers_memory_node_info(target); + if (r < 0) break; + if (r > 0) { + qn->target_empty_info = r; + } #endif #if 0 - r = get_max_match_length(target, &d, env); - if (r == 0 && d == 0) { - /* ()* ==> ()?, ()+ ==> () */ - qn->upper = 1; - if (qn->lower > 1) qn->lower = 1; - if (NTYPE(target) == N_STRING) { - qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */ - } - } + r = get_max_match_length(target, &d, env); + if (r == 0 && d == 0) { + /* ()* ==> ()?, ()+ ==> () */ + qn->upper = 1; + if (qn->lower > 1) qn->lower = 1; + if (NTYPE(target) == NT_STR) { + qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */ + } + } #endif - } + } } + state |= IN_REPEAT; if (qn->lower != qn->upper) - state |= IN_REPEAT; + state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 - if (NTYPE(target) == N_STRING) { - if (!IS_REPEAT_INFINITE(qn->lower) && qn->lower == qn->upper && - qn->lower > 1 && qn->lower <= EXPAND_STRING_MAX_LENGTH) { - int len = NSTRING_LEN(target); - StrNode* sn = &(NSTRING(target)); - - if (len * qn->lower <= EXPAND_STRING_MAX_LENGTH) { - int i, n = qn->lower; - onig_node_conv_to_str_node(node, NSTRING(target).flag); - for (i = 0; i < n; i++) { - r = onig_node_str_cat(node, sn->s, sn->end); - if (r) break; - } - onig_node_free(target); - break; /* break case N_QUALIFIER: */ - } - } + if (NTYPE(target) == NT_STR) { + if (qn->lower > 1) { + int i, n = qn->lower; + OnigDistance len = NSTRING_LEN(target); + StrNode* sn = NSTR(target); + Node* np; + + np = onig_node_new_str(sn->s, sn->end); + if (IS_NULL(np)) return ONIGERR_MEMORY; + NSTR(np)->flag = sn->flag; + + for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) { + r = onig_node_str_cat(np, sn->s, sn->end); + if (r) { + onig_node_free(np); + return r; + } + } + if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) { + Node *np1, *np2; + + qn->lower -= i; + if (! IS_REPEAT_INFINITE(qn->upper)) + qn->upper -= i; + + np1 = onig_node_new_list(np, NULL); + if (IS_NULL(np1)) { + onig_node_free(np); + return ONIGERR_MEMORY; + } + swap_node(np1, node); + np2 = onig_node_list_add(node, np1); + if (IS_NULL(np2)) { + onig_node_free(np1); + return ONIGERR_MEMORY; + } + } + else { + swap_node(np, node); + onig_node_free(np); + } + break; /* break case NT_QTFR: */ + } } #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (qn->greedy && (qn->target_empty_info != 0)) { - if (NTYPE(target) == N_QUALIFIER) { - QualifierNode* tqn = &(NQUALIFIER(target)); - if (IS_NOT_NULL(tqn->head_exact)) { - qn->head_exact = tqn->head_exact; - tqn->head_exact = NULL; - } - } - else { - qn->head_exact = get_head_value_node(qn->target, 1, reg); - } + if (NTYPE(target) == NT_QTFR) { + QtfrNode* tqn = NQTFR(target); + if (IS_NOT_NULL(tqn->head_exact)) { + qn->head_exact = tqn->head_exact; + tqn->head_exact = NULL; + } + } + else { + qn->head_exact = get_head_value_node(qn->target, 1, reg); + } } #endif } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_OPTION: - { - OnigOptionType options = reg->options; - reg->options = NEFFECT(node).option; - r = setup_tree(NEFFECT(node).target, reg, state, env); - reg->options = options; - } - break; + case ENCLOSE_OPTION: + { + OnigOptionType options = reg->options; + reg->options = NENCLOSE(node)->option; + r = setup_tree(NENCLOSE(node)->target, reg, state, env); + reg->options = options; + } + break; - case EFFECT_MEMORY: - if ((state & (IN_ALT | IN_NOT | IN_REPEAT)) != 0) { - BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); - /* SET_EFFECT_STATUS(node, NST_MEM_IN_ALT_NOT); */ - } - /* fall */ - case EFFECT_STOP_BACKTRACK: - { - Node* target = en->target; - r = setup_tree(target, reg, state, env); - if (NTYPE(target) == N_QUALIFIER) { - QualifierNode* tqn = &(NQUALIFIER(target)); - if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && - tqn->greedy != 0) { /* (?>a*), a*+ etc... */ - int qtype = NTYPE(tqn->target); - if (IS_NODE_TYPE_SIMPLE(qtype)) - SET_EFFECT_STATUS(node, NST_SIMPLE_REPEAT); - } - } - } - break; + case ENCLOSE_MEMORY: + if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) { + BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); + /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */ + } + if (IS_ENCLOSE_CALLED(en)) + state |= IN_CALL; + if (IS_ENCLOSE_RECURSION(en)) + state |= IN_RECCALL; + else if ((state & IN_RECCALL) != 0) + SET_CALL_RECURSION(node); + r = setup_tree(en->target, reg, state, env); + break; + + case ENCLOSE_STOP_BACKTRACK: + { + Node* target = en->target; + r = setup_tree(target, reg, state, env); + if (NTYPE(target) == NT_QTFR) { + QtfrNode* tqn = NQTFR(target); + if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && + tqn->greedy != 0) { /* (?>a*), a*+ etc... */ + int qtype = NTYPE(tqn->target); + if (IS_NODE_TYPE_SIMPLE(qtype)) + SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); + } + } + } + break; + + case ENCLOSE_CONDITION: +#ifdef USE_NAMED_GROUP + if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) && + env->num_named > 0 && + IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && + !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; + } +#endif + if (NENCLOSE(node)->regnum > env->num_mem) + return ONIGERR_INVALID_BACKREF; + r = setup_tree(NENCLOSE(node)->target, reg, state, env); + break; + + case ENCLOSE_ABSENT: + r = setup_tree(NENCLOSE(node)->target, reg, state, env); + break; } } break; - case N_ANCHOR: + case NT_ANCHOR: { - AnchorNode* an = &(NANCHOR(node)); + AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: - r = setup_tree(an->target, reg, state, env); - break; + r = setup_tree(an->target, reg, state, env); + break; case ANCHOR_PREC_READ_NOT: - r = setup_tree(an->target, reg, (state | IN_NOT), env); - break; + r = setup_tree(an->target, reg, (state | IN_NOT), env); + break; /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ - ( N_LIST | N_ALT | N_STRING | N_CCLASS | N_CTYPE | \ - N_ANYCHAR | N_ANCHOR | N_EFFECT | N_QUALIFIER | N_CALL ) + ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \ + BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL ) -#define ALLOWED_EFFECT_IN_LB ( EFFECT_MEMORY ) -#define ALLOWED_EFFECT_IN_LB_NOT 0 +#define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION ) +#define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION #define ALLOWED_ANCHOR_IN_LB \ -( ANCHOR_LOOK_BEHIND | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) +( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ + ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ + ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ + ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) #define ALLOWED_ANCHOR_IN_LB_NOT \ -( ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | ANCHOR_END_LINE | ANCHOR_BEGIN_BUF ) - /* can't allow all anchors, because \G in look-behind through Search(). - ex. /(?<=\G)zz/.match("azz") => success. */ +( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ + ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ + ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ + ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) case ANCHOR_LOOK_BEHIND: - { - r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, - ALLOWED_EFFECT_IN_LB, ALLOWED_ANCHOR_IN_LB); - if (r < 0) return r; - if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; - r = setup_tree(an->target, reg, state, env); - } - break; + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + if (NTYPE(node) != NT_ANCHOR) goto restart; + r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); + } + break; case ANCHOR_LOOK_BEHIND_NOT: - { - r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, - ALLOWED_EFFECT_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); - if (r < 0) return r; - if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; - r = setup_look_behind(node, reg, env); - if (r != 0) return r; - r = setup_tree(an->target, reg, (state | IN_NOT), env); - } - break; + { + r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, + ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); + if (r < 0) return r; + if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; + if (NTYPE(node) != NT_ANCHOR) goto restart; + r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND), + env); + if (r != 0) return r; + r = setup_look_behind(node, reg, env); + } + break; } } break; @@ -3226,51 +4215,73 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) return r; } -/* set skip map for Boyer-Moor search */ +/* set skip map for Sunday's quick search */ static int -set_bm_skip(UChar* s, UChar* end, OnigEncoding enc, int ignore_case, - UChar skip[], int** int_skip) +set_bm_skip(UChar* s, UChar* end, regex_t* reg, + UChar skip[], int ignore_case) { - int i, len; - UChar lowbuf[ONIGENC_MBC_TO_LOWER_MAXLEN]; + OnigDistance i, len; + int clen, flen, n, j, k; + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; + OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; + OnigEncoding enc = reg->enc; len = end - s; - if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = len; + if (len >= ONIG_CHAR_TABLE_SIZE) { + /* This should not happen. */ + return ONIGERR_TYPE_BUG; + } - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - skip[*lowbuf] = len - 1 - i; + if (ignore_case) { + for (i = 0; i < len; i += clen) { + p = s + i; + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); + + for (j = 0; j < n; j++) { + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf); + if (flen != clen) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } } } - else { - for (i = 0; i < len - 1; i++) - skip[s[i]] = len - 1 - i; - } +endcheck: + len = end - s; } - else { - if (IS_NULL(*int_skip)) { - *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); - if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; - } - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = len; - if (ignore_case) { - for (i = 0; i < len - 1; i++) { - ONIGENC_MBC_TO_LOWER(enc, &(s[i]), lowbuf); - (*int_skip)[*lowbuf] = len - 1 - i; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + skip[i] = (UChar )(len + 1); + n = 0; + for (i = 0; i < len; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); + + for (j = 0; j < clen; j++) { + skip[s[i + j]] = (UChar )(len - i - j); + for (k = 0; k < n; k++) { + ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf); + skip[buf[j]] = (UChar )(len - i - j); } } - else { - for (i = 0; i < len - 1; i++) - (*int_skip)[s[i]] = len - 1 - i; - } } - return 0; -} -#define OPT_EXACT_MAXLEN 24 + return (int )len; +} typedef struct { OnigDistance min; /* min byte length */ @@ -3278,11 +4289,11 @@ typedef struct { } MinMaxLen; typedef struct { - MinMaxLen mmd; - BitStatusType backrefed_status; - OnigEncoding enc; - OnigOptionType options; - ScanEnv* scan_env; + MinMaxLen mmd; + OnigEncoding enc; + OnigOptionType options; + OnigCaseFoldType case_fold_flag; + ScanEnv* scan_env; } OptEnv; typedef struct { @@ -3295,7 +4306,7 @@ typedef struct { OptAncInfo anc; int reach_end; - int ignore_case; + int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */ int len; UChar s[OPT_EXACT_MAXLEN]; } OptExactInfo; @@ -3321,49 +4332,54 @@ typedef struct { static int -map_position_value(int i) -{ - static int vals[] = { - 10, 10, 10, 10, 10, 10, 10, 10, 10, 1, 1, 10, 10, 1, 10, 10, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 1, 6, 3, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 5, 5, - 5, 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 10, +map_position_value(OnigEncoding enc, int i) +{ + static const short int ByteValTable[] = { + 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, + 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 }; - if (i < sizeof(vals)/sizeof(vals[0])) return vals[i]; - - return 7; /* Take it easy. */ + if (i < numberof(ByteValTable)) { + if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1) + return 20; + else + return (int )ByteValTable[i]; + } + else + return 4; /* Take it easy. */ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ - static int dist_vals[] = { - 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, - 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, - 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, - 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, - 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, - 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, - 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, - 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, - 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, + static const short int dist_vals[] = { + 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, + 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, + 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, + 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, + 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, + 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, + 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, + 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, + 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10 }; - int d; + OnigDistance d; if (mm->max == ONIG_INFINITE_DISTANCE) return 0; d = mm->max - mm->min; - if (d < sizeof(dist_vals)/sizeof(dist_vals[0])) + if (d < numberof(dist_vals)) /* return dist_vals[d] * 16 / (mm->min + 12); */ - return dist_vals[d]; + return (int )dist_vals[d]; else return 1; } @@ -3419,12 +4435,14 @@ add_mml(MinMaxLen* to, MinMaxLen* from) to->max = distance_add(to->max, from->max); } +#if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } +#endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) @@ -3454,7 +4472,7 @@ copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from) static void concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, - OnigDistance left_len, OnigDistance right_len) + OnigDistance left_len, OnigDistance right_len) { clear_opt_anc_info(to); @@ -3467,6 +4485,9 @@ concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, if (right_len == 0) { to->right_anchor |= left->right_anchor; } + else { + to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT); + } } static int @@ -3525,7 +4546,7 @@ clear_opt_exact_info(OptExactInfo* ex) clear_mml(&ex->mmd); clear_opt_anc_info(&ex->anc); ex->reach_end = 0; - ex->ignore_case = 0; + ex->ignore_case = -1; /* unset */ ex->len = 0; ex->s[0] = '\0'; } @@ -3537,22 +4558,28 @@ copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) } static void -concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) +concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc) { - int i, n; + int i, j, len; + UChar *p, *end; OptAncInfo tanc; - if (! to->ignore_case && add->ignore_case) { - if (to->len >= add->len) return ; /* avoid */ - - to->ignore_case = 1; + if (to->ignore_case < 0) + to->ignore_case = add->ignore_case; + else if (to->ignore_case != add->ignore_case) + return ; /* avoid */ + + p = add->s; + end = p + add->len; + for (i = to->len; p < end; ) { + len = enclen(enc, p, end); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) + to->s[i++] = *p++; } - for (i = to->len, n = 0; n < add->len && i < OPT_EXACT_MAXLEN; i++, n++) - to->s[i] = add->s[n]; - to->len = i; - to->reach_end = (n == add->len ? add->reach_end : 0); + to->reach_end = (p == end ? add->reach_end : 0); concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); if (! to->reach_end) tanc.right_anchor = 0; @@ -3560,22 +4587,17 @@ concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) } static void -concat_opt_exact_info_str(OptExactInfo* to, - UChar* s, UChar* end, int raw, OnigEncoding enc) +concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end, + int raw ARG_UNUSED, OnigEncoding enc) { int i, j, len; UChar *p; for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { - if (raw) { + len = enclen(enc, p, end); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) to->s[i++] = *p++; - } - else { - len = enc_len(enc, *p); - if (i + len > OPT_EXACT_MAXLEN) break; - for (j = 0; j < len; j++) - to->s[i++] = *p++; - } } to->len = i; @@ -3598,7 +4620,7 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; - len = enc_len(env->enc, to->s[i]); + len = enclen(env->enc, to->s + i, to->s + to->len); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; @@ -3611,34 +4633,72 @@ alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) to->reach_end = 0; } to->len = i; - to->ignore_case |= add->ignore_case; + if (to->ignore_case < 0) + to->ignore_case = add->ignore_case; + else if (add->ignore_case >= 0) + to->ignore_case |= add->ignore_case; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right_anchor = 0; } static void -select_opt_exact_info(OptExactInfo* now, OptExactInfo* alt) +select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt) { - int vlen1, vlen2; + int v1, v2; - vlen1 = now->len * (now->ignore_case ? 1 : 2); - vlen2 = alt->len * (alt->ignore_case ? 1 : 2); + v1 = now->len; + v2 = alt->len; - if (comp_distance_value(&now->mmd, &alt->mmd, vlen1, vlen2) > 0) + if (v2 == 0) { + return ; + } + else if (v1 == 0) { + copy_opt_exact_info(now, alt); + return ; + } + else if (v1 <= 2 && v2 <= 2) { + /* ByteValTable[x] is big value --> low price */ + v2 = map_position_value(enc, now->s[0]); + v1 = map_position_value(enc, alt->s[0]); + + if (now->len > 1) v1 += 5; + if (alt->len > 1) v2 += 5; + } + + if (now->ignore_case <= 0) v1 *= 2; + if (alt->ignore_case <= 0) v2 *= 2; + + if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } static void clear_opt_map_info(OptMapInfo* map) { - int i; + static const OptMapInfo clean_info = { + {0, 0}, {0, 0}, 0, + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + } + }; - clear_mml(&map->mmd); - clear_opt_anc_info(&map->anc); - map->value = 0; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - map->map[i] = 0; + xmemcpy(map, &clean_info, sizeof(OptMapInfo)); } static void @@ -3648,40 +4708,40 @@ copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) } static void -add_char_opt_map_info(OptMapInfo* map, int c) +add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc) { if (map->map[c] == 0) { map->map[c] = 1; - map->value += map_position_value(c); + map->value += map_position_value(enc, c); } } -static void -add_char_amb_opt_map_info(OptMapInfo* map, int c, OnigEncoding enc) +static int +add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, + OnigEncoding enc, OnigCaseFoldType case_fold_flag) { - UChar x, low[ONIGENC_MBC_TO_LOWER_MAXLEN]; + OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; + UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; + int i, n; - add_char_opt_map_info(map, c); + add_char_opt_map_info(map, p[0], enc); - x = (UChar )c; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if (*low != x) { - add_char_opt_map_info(map, (int )(*low)); - } - else { - int i; - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { - x = (UChar )i; - ONIGENC_MBC_TO_LOWER(enc, &x, low); - if ((int )(*low) == c) add_char_opt_map_info(map, i); - } + case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items); + if (n < 0) return n; + + for (i = 0; i < n; i++) { + ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf); + add_char_opt_map_info(map, buf[0], enc); } + + return 0; } static void select_opt_map_info(OptMapInfo* now, OptMapInfo* alt) { - static int z = 1<<15; /* 32768: something big value */ + const int z = 1<<15; /* 32768: something big value */ int v1, v2; @@ -3705,13 +4765,13 @@ comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m) if (m->value <= 0) return -1; - ve = COMP_EM_BASE * e->len * (e->ignore_case ? 1 : 2); + ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2); vm = COMP_EM_BASE * 5 * 2 / m->value; return comp_distance_value(&e->mmd, &m->mmd, ve, vm); } static void -alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) +alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add) { int i, val; @@ -3730,7 +4790,7 @@ alt_merge_opt_map_info(OptMapInfo* to, OptMapInfo* add) to->map[i] = 1; if (to->map[i]) - val += map_position_value(i); + val += map_position_value(enc, i); } to->value = val; @@ -3763,7 +4823,7 @@ copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from) } static void -concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) +concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) { int exb_reach, exm_reach; OptAncInfo tanc; @@ -3773,7 +4833,7 @@ concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) if (add->exb.len > 0 && to->len.max == 0) { concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc, - to->len.max, add->len.max); + to->len.max, add->len.max); copy_opt_anc_info(&add->exb.anc, &tanc); } @@ -3790,26 +4850,26 @@ concat_left_node_opt_info(NodeOptInfo* to, NodeOptInfo* add) if (add->exb.len > 0) { if (exb_reach) { - concat_opt_exact_info(&to->exb, &add->exb); + concat_opt_exact_info(&to->exb, &add->exb, enc); clear_opt_exact_info(&add->exb); } else if (exm_reach) { - concat_opt_exact_info(&to->exm, &add->exb); + concat_opt_exact_info(&to->exm, &add->exb, enc); clear_opt_exact_info(&add->exb); } } - select_opt_exact_info(&to->exm, &add->exb); - select_opt_exact_info(&to->exm, &add->exm); + select_opt_exact_info(enc, &to->exm, &add->exb); + select_opt_exact_info(enc, &to->exm, &add->exm); if (to->expr.len > 0) { if (add->len.max > 0) { if (to->expr.len > (int )add->len.max) - to->expr.len = add->len.max; + to->expr.len = (int )add->len.max; if (to->expr.mmd.max == 0) - select_opt_exact_info(&to->exb, &to->expr); + select_opt_exact_info(enc, &to->exb, &to->expr); else - select_opt_exact_info(&to->exm, &to->expr); + select_opt_exact_info(enc, &to->exm, &to->expr); } } else if (add->expr.len > 0) { @@ -3828,7 +4888,7 @@ alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env) alt_merge_opt_exact_info(&to->exb, &add->exb, env); alt_merge_opt_exact_info(&to->exm, &add->exm, env); alt_merge_opt_exact_info(&to->expr, &add->expr, env); - alt_merge_opt_map_info (&to->map, &add->map); + alt_merge_opt_map_info(env->enc, &to->map, &add->map); alt_merge_mml(&to->len, &add->len); } @@ -3847,7 +4907,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) type = NTYPE(node); switch (type) { - case N_LIST: + case NT_LIST: { OptEnv nenv; NodeOptInfo nopt; @@ -3855,220 +4915,192 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) copy_opt_env(&nenv, env); do { - r = optimize_node_left(NCONS(nd).left, &nopt, &nenv); - if (r == 0) { - add_mml(&nenv.mmd, &nopt.len); - concat_left_node_opt_info(opt, &nopt); - } - } while (r == 0 && IS_NOT_NULL(nd = NCONS(nd).right)); + r = optimize_node_left(NCAR(nd), &nopt, &nenv); + if (r == 0) { + add_mml(&nenv.mmd, &nopt.len); + concat_left_node_opt_info(env->enc, opt, &nopt); + } + } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd))); } break; - case N_ALT: + case NT_ALT: { NodeOptInfo nopt; Node* nd = node; do { - r = optimize_node_left(NCONS(nd).left, &nopt, env); - if (r == 0) { - if (nd == node) copy_node_opt_info(opt, &nopt); - else alt_merge_node_opt_info(opt, &nopt, env); - } - } while ((r == 0) && IS_NOT_NULL(nd = NCONS(nd).right)); + r = optimize_node_left(NCAR(nd), &nopt, env); + if (r == 0) { + if (nd == node) copy_node_opt_info(opt, &nopt); + else alt_merge_node_opt_info(opt, &nopt, env); + } + } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd))); } break; - case N_STRING: + case NT_STR: { - UChar *p; - int len, plen; - StrNode* sn = &(NSTRING(node)); - int slen = sn->end - sn->s; + StrNode* sn = NSTR(node); + OnigDistance slen = sn->end - sn->s; int is_raw = NSTRING_IS_RAW(node); - if ((! IS_IGNORECASE(env->options)) || is_raw) { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - NSTRING_IS_RAW(node), env->enc); - if (slen > 0) { - add_char_opt_map_info(&opt->map, *(sn->s)); - } + if (! NSTRING_IS_AMBIG(node)) { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 0; + if (slen > 0) { + add_char_opt_map_info(&opt->map, *(sn->s), env->enc); + } + set_mml(&opt->len, slen, slen); } else { - for (p = sn->s; p < sn->end; ) { - len = enc_len(env->enc, *p); - if (len == 1 && ONIGENC_IS_MBC_CASE_AMBIG(env->enc, p)) { - break; - } - p += len; - } + OnigDistance max; - plen = p - sn->s; - if (plen > slen / 5) { - concat_opt_exact_info_str(&opt->exb, sn->s, p, is_raw, env->enc); - concat_opt_exact_info_str(&opt->exm, p, sn->end, is_raw, env->enc); - opt->exm.ignore_case = 1; - if (opt->exm.len == sn->end - p) - opt->exm.reach_end = 1; + if (NSTRING_IS_DONT_GET_OPT_INFO(node)) { + int n = onigenc_strlen(env->enc, sn->s, sn->end); + max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n; + } + else { + concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, + is_raw, env->enc); + opt->exb.ignore_case = 1; + + if (slen > 0) { + r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, + env->enc, env->case_fold_flag); + if (r != 0) break; + } - copy_mml(&(opt->exm.mmd), &(opt->exb.mmd)); - add_len_mml(&(opt->exm.mmd), plen); - } - else { - concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, - is_raw, env->enc); - opt->exb.ignore_case = 1; - } + max = slen; + } - if (slen > 0) { - if (p == sn->s) - add_char_amb_opt_map_info(&opt->map, *(sn->s), env->enc); - else - add_char_opt_map_info(&opt->map, *(sn->s)); - } + set_mml(&opt->len, slen, max); } - if (opt->exb.len == slen) - opt->exb.reach_end = 1; - - set_mml(&opt->len, slen, slen); + if ((OnigDistance )opt->exb.len == slen) + opt->exb.reach_end = 1; } break; - case N_CCLASS: + case NT_CCLASS: { - int i, z, len, found, mb_found; - CClassNode* cc = &(NCCLASS(node)); - - /* no need to check ignore case. (setted in setup_tree()) */ - found = mb_found = 0; - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = BITSET_AT(cc->bs, i); - if ((z && !cc->not) || (!z && cc->not)) { - found = 1; - add_char_opt_map_info(&opt->map, i); - } - } + int i, z; + CClassNode* cc = NCCLASS(node); - if (! ONIGENC_IS_SINGLEBYTE(env->enc)) { - if (! IS_NULL(cc->mbuf) || - (cc->not != 0 && found != 0)) { - for (i = 0; i < SINGLE_BYTE_SIZE; i++) { - z = ONIGENC_IS_MBC_HEAD(env->enc, i); - if (z) { - mb_found = 1; - add_char_opt_map_info(&opt->map, i); - } - } - } - } + /* no need to check ignore case. (set in setup_tree()) */ - if (mb_found) { - len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + + set_mml(&opt->len, min, max); } - else if (found) { - len = 1; - set_mml(&opt->len, 1, len); + else { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + z = BITSET_AT(cc->bs, i); + if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + set_mml(&opt->len, 1, 1); } } break; - case N_CTYPE: + case NT_CTYPE: { - int c; - int len, min, max; - - min = ONIGENC_MBC_MAXLEN_DIST(env->enc); - max = 0; - -#define IS_WORD_HEAD_BYTE(enc,b) \ - (ONIGENC_IS_MBC_ASCII(&b) ? ONIGENC_IS_CODE_WORD(enc,((OnigCodePoint )b)) \ - : ONIGENC_IS_MBC_HEAD(enc,b)) - - switch (NCTYPE(node).type) { - case CTYPE_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; - - case CTYPE_NOT_WORD: - for (c = 0; c < SINGLE_BYTE_SIZE; c++) { - if (! IS_WORD_HEAD_BYTE(env->enc, c)) { - add_char_opt_map_info(&opt->map, c); - len = enc_len(env->enc, c); - if (len < min) min = len; - if (len > max) max = len; - } - } - break; + int i, min, max; + int maxcode; + + max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + + if (max == 1) { + min = 1; + + maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE; + switch (NCTYPE(node)->ctype) { + case ONIGENC_CTYPE_WORD: + if (NCTYPE(node)->not != 0) { + for (i = 0; i < SINGLE_BYTE_SIZE; i++) { + if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + } + else { + for (i = 0; i < maxcode; i++) { + if (ONIGENC_IS_CODE_WORD(env->enc, i)) { + add_char_opt_map_info(&opt->map, (UChar )i, env->enc); + } + } + } + break; + } + } + else { + min = ONIGENC_MBC_MINLEN(env->enc); } - set_mml(&opt->len, min, max); } break; - case N_ANYCHAR: + case NT_CANY: { - OnigDistance len = ONIGENC_MBC_MAXLEN_DIST(env->enc); - set_mml(&opt->len, 1, len); + OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); + OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); + set_mml(&opt->len, min, max); } break; - case N_ANCHOR: - switch (NANCHOR(node).type) { + case NT_ANCHOR: + switch (NANCHOR(node)->type) { case ANCHOR_BEGIN_BUF: case ANCHOR_BEGIN_POSITION: case ANCHOR_BEGIN_LINE: case ANCHOR_END_BUF: case ANCHOR_SEMI_END_BUF: case ANCHOR_END_LINE: - add_opt_anc_info(&opt->anc, NANCHOR(node).type); + case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */ + case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */ + add_opt_anc_info(&opt->anc, NANCHOR(node)->type); break; case ANCHOR_PREC_READ: { - NodeOptInfo nopt; + NodeOptInfo nopt; - r = optimize_node_left(NANCHOR(node).target, &nopt, env); - if (r == 0) { - if (nopt.exb.len > 0) - copy_opt_exact_info(&opt->expr, &nopt.exb); - else if (nopt.exm.len > 0) - copy_opt_exact_info(&opt->expr, &nopt.exm); + r = optimize_node_left(NANCHOR(node)->target, &nopt, env); + if (r == 0) { + if (nopt.exb.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exb); + else if (nopt.exm.len > 0) + copy_opt_exact_info(&opt->expr, &nopt.exm); - opt->expr.reach_end = 0; + opt->expr.reach_end = 0; - if (nopt.map.value > 0) - copy_opt_map_info(&opt->map, &nopt.map); - } + if (nopt.map.value > 0) + copy_opt_map_info(&opt->map, &nopt.map); + } } break; - case ANCHOR_PREC_READ_NOT: - case ANCHOR_LOOK_BEHIND: /* Sorry, I can't make use of it. */ case ANCHOR_LOOK_BEHIND_NOT: break; } break; - case N_BACKREF: + case NT_BREF: { int i; int* backs; OnigDistance min, max, tmin, tmax; Node** nodes = SCANENV_MEM_NODES(env->scan_env); - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) { - set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); - break; + set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); + break; } backs = BACKREFS_P(br); r = get_min_match_length(nodes[backs[0]], &min, env->scan_env); @@ -4076,125 +5108,131 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) r = get_max_match_length(nodes[backs[0]], &max, env->scan_env); if (r != 0) break; for (i = 1; i < br->back_num; i++) { - r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); - if (r != 0) break; - r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); - if (r != 0) break; - if (min > tmin) min = tmin; - if (max < tmax) max = tmax; + r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); + if (r != 0) break; + r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); + if (r != 0) break; + if (min > tmin) min = tmin; + if (max < tmax) max = tmax; } if (r == 0) set_mml(&opt->len, min, max); } break; #ifdef USE_SUBEXP_CALL - case N_CALL: - if (IS_CALL_RECURSION(&(NCALL(node)))) + case NT_CALL: + if (IS_CALL_RECURSION(NCALL(node))) set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); else { OnigOptionType save = env->options; - env->options = NEFFECT(NCALL(node).target).option; - r = optimize_node_left(NCALL(node).target, opt, env); + env->options = NENCLOSE(NCALL(node)->target)->option; + r = optimize_node_left(NCALL(node)->target, opt, env); env->options = save; } break; #endif - case N_QUALIFIER: + case NT_QTFR: { int i; OnigDistance min, max; NodeOptInfo nopt; - QualifierNode* qn = &(NQUALIFIER(node)); + QtfrNode* qn = NQTFR(node); r = optimize_node_left(qn->target, &nopt, env); if (r) break; if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { - if (env->mmd.max == 0 && - NTYPE(qn->target) == N_ANYCHAR && qn->greedy) { - if (IS_POSIXLINE(env->options)) - add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_PL); - else - add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); - } + if (env->mmd.max == 0 && + NTYPE(qn->target) == NT_CANY && qn->greedy) { + if (IS_MULTILINE(env->options)) + /* implicit anchor: /.*a/ ==> /\A.*a/ */ + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML); + else + add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); + } } else { - if (qn->lower > 0) { - copy_node_opt_info(opt, &nopt); - if (nopt.exb.len > 0) { - if (nopt.exb.reach_end) { - for (i = 2; i < qn->lower && - ! is_full_opt_exact_info(&opt->exb); i++) { - concat_opt_exact_info(&opt->exb, &nopt.exb); - } - if (i < qn->lower) { - opt->exb.reach_end = 0; - } - } - } - - if (qn->lower != qn->upper) { - opt->exb.reach_end = 0; - opt->exm.reach_end = 0; - } - if (qn->lower > 1) - opt->exm.reach_end = 0; - } + if (qn->lower > 0) { + copy_node_opt_info(opt, &nopt); + if (nopt.exb.len > 0) { + if (nopt.exb.reach_end) { + for (i = 2; i <= qn->lower && + ! is_full_opt_exact_info(&opt->exb); i++) { + concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc); + } + if (i < qn->lower) { + opt->exb.reach_end = 0; + } + } + } + + if (qn->lower != qn->upper) { + opt->exb.reach_end = 0; + opt->exm.reach_end = 0; + } + if (qn->lower > 1) + opt->exm.reach_end = 0; + } } min = distance_multiply(nopt.len.min, qn->lower); if (IS_REPEAT_INFINITE(qn->upper)) - max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0); + max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0); else - max = distance_multiply(nopt.len.max, qn->upper); + max = distance_multiply(nopt.len.max, qn->upper); set_mml(&opt->len, min, max); } break; - case N_EFFECT: + case NT_ENCLOSE: { - EffectNode* en = &(NEFFECT(node)); + EncloseNode* en = NENCLOSE(node); switch (en->type) { - case EFFECT_OPTION: - { - OnigOptionType save = env->options; + case ENCLOSE_OPTION: + { + OnigOptionType save = env->options; - env->options = en->option; - r = optimize_node_left(en->target, opt, env); - env->options = save; - } - break; + env->options = en->option; + r = optimize_node_left(en->target, opt, env); + env->options = save; + } + break; - case EFFECT_MEMORY: + case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL - en->opt_count++; - if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { - OnigDistance min, max; - - min = 0; - max = ONIG_INFINITE_DISTANCE; - if (IS_EFFECT_MIN_FIXED(en)) min = en->min_len; - if (IS_EFFECT_MAX_FIXED(en)) max = en->max_len; - set_mml(&opt->len, min, max); - } - else + en->opt_count++; + if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { + OnigDistance min, max; + + min = 0; + max = ONIG_INFINITE_DISTANCE; + if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len; + if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len; + set_mml(&opt->len, min, max); + } + else #endif - { - r = optimize_node_left(en->target, opt, env); + { + r = optimize_node_left(en->target, opt, env); - if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) { - if (BIT_STATUS_AT(env->backrefed_status, en->regnum)) - remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK); - } - } - break; + if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) { + if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum)) + remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK); + } + } + break; - case EFFECT_STOP_BACKTRACK: - r = optimize_node_left(en->target, opt, env); - break; + case ENCLOSE_STOP_BACKTRACK: + case ENCLOSE_CONDITION: + r = optimize_node_left(en->target, opt, env); + break; + + case ENCLOSE_ABSENT: + set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); + break; } } break; @@ -4202,7 +5240,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) default: #ifdef ONIG_DEBUG fprintf(stderr, "optimize_node_left: undefined node type %d\n", - NTYPE(node)); + NTYPE(node)); #endif r = ONIGERR_TYPE_BUG; break; @@ -4214,53 +5252,49 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) static int set_optimize_exact_info(regex_t* reg, OptExactInfo* e) { - int r; + int allow_reverse; if (e->len == 0) return 0; - reg->exact = onig_strdup(e->s, e->s + e->len); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - + reg->exact = (UChar* )xmalloc(e->len); + CHECK_NULL_RETURN_MEMERR(reg->exact); + xmemcpy(reg->exact, e->s, e->len); reg->exact_end = reg->exact + e->len; - - if (e->ignore_case) { - UChar buf[ONIGENC_MBC_TO_LOWER_MAXLEN]; - int len, low_len, i, j, alloc_size; - - alloc_size = e->len; - i = j = 0; - while (i < e->len) { - low_len = ONIGENC_MBC_TO_LOWER(reg->enc, &(e->s[i]), buf); - len = enc_len(reg->enc, e->s[i]); - if (low_len > alloc_size - i) { - reg->exact = xrealloc(reg->exact, alloc_size * 2); - CHECK_NULL_RETURN_VAL(reg->exact, ONIGERR_MEMORY); - alloc_size *= 2; - } - xmemcpy(&(reg->exact[j]), buf, low_len); - i += len; - j += low_len; + allow_reverse = + ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); + + if (e->ignore_case > 0) { + if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { + int orig_len = e->len; + e->len = set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, 1); + if (e->len >= 3) { + reg->exact_end = reg->exact + e->len; + reg->optimize = (allow_reverse != 0 + ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC); + } + else { + /* Even if BM skip table can't be built (e.g., pattern starts with + 's' or 'k' which have multi-byte case fold variants), we should + still use EXACT_IC optimization with the original pattern. + Without this fallback, patterns like /slackware/i have no + optimization at all, causing severe performance regression + especially with non-ASCII strings. See [Bug #21824] */ + e->len = orig_len; /* Restore original length for EXACT_IC */ + reg->optimize = ONIG_OPTIMIZE_EXACT_IC; + } + } + else { + reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } - reg->exact_end = reg->exact + j; - reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } else { - int allow_reverse; - - if (e->anc.left_anchor & ANCHOR_BEGIN_LINE) - allow_reverse = 1; - else - allow_reverse = - ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); - if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg->enc, 0, - reg->map, &(reg->int_map)); - if (r) return r; - + set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, 0); reg->optimize = (allow_reverse != 0 - ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); + ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); } else { reg->optimize = ONIG_OPTIMIZE_EXACT; @@ -4271,7 +5305,7 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) reg->dmax = e->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { - reg->threshold_len = reg->dmin + (reg->exact_end - reg->exact); + reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact)); } return 0; @@ -4290,7 +5324,7 @@ set_optimize_map_info(regex_t* reg, OptMapInfo* m) reg->dmax = m->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { - reg->threshold_len = reg->dmin + 1; + reg->threshold_len = (int )(reg->dmin + 1); } } @@ -4301,7 +5335,7 @@ set_sub_anchor(regex_t* reg, OptAncInfo* anc) reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE; } -#ifdef ONIG_DEBUG +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) static void print_optimize_info(FILE* f, regex_t* reg); #endif @@ -4313,8 +5347,9 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) NodeOptInfo opt; OptEnv env; - env.enc = reg->enc; - env.options = reg->options; + env.enc = reg->enc; + env.options = reg->options; + env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; clear_mml(&env.mmd); @@ -4322,9 +5357,14 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) if (r) return r; reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | - ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_PL); + ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML | + ANCHOR_LOOK_BEHIND); + + if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0) + reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML; - reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF); + reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF | + ANCHOR_PREC_READ_NOT); if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) { reg->anchor_dmin = opt.len.min; @@ -4332,9 +5372,9 @@ set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) } if (opt.exb.len > 0 || opt.exm.len > 0) { - select_opt_exact_info(&opt.exb, &opt.exm); + select_opt_exact_info(reg->enc, &opt.exb, &opt.exm); if (opt.map.value > 0 && - comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { + comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { goto set_map; } else { @@ -4369,28 +5409,60 @@ clear_optimize_info(regex_t* reg) reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->threshold_len = 0; - if (IS_NOT_NULL(reg->exact)) { - xfree(reg->exact); - reg->exact = (UChar* )NULL; - } + xfree(reg->exact); + reg->exact = (UChar* )NULL; } #ifdef ONIG_DEBUG +static void print_enc_string(FILE* fp, OnigEncoding enc, + const UChar *s, const UChar *end) +{ + fprintf(fp, "\nPATTERN: /"); + + if (ONIGENC_MBC_MINLEN(enc) > 1) { + const UChar *p; + OnigCodePoint code; + + p = s; + while (p < end) { + code = ONIGENC_MBC_TO_CODE(enc, p, end); + if (code >= 0x80) { + fprintf(fp, " 0x%04x ", (int )code); + } + else { + fputc((int )code, fp); + } + + p += enclen(enc, p, end); + } + } + else { + while (s < end) { + fputc((int )*s, fp); + s++; + } + } + + fprintf(fp, "/ (%s)\n", enc->name); +} +#endif /* ONIG_DEBUG */ + +#if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) static void print_distance_range(FILE* f, OnigDistance a, OnigDistance b) { if (a == ONIG_INFINITE_DISTANCE) fputs("inf", f); else - fprintf(f, "(%u)", a); + fprintf(f, "(%"PRIuPTR")", a); fputs("-", f); if (b == ONIG_INFINITE_DISTANCE) fputs("inf", f); else - fprintf(f, "(%u)", b); + fprintf(f, "(%"PRIuPTR")", b); } static void @@ -4434,9 +5506,9 @@ print_anchor(FILE* f, int anchor) q = 1; fprintf(f, "anychar-star"); } - if (anchor & ANCHOR_ANYCHAR_STAR_PL) { + if (anchor & ANCHOR_ANYCHAR_STAR_ML) { if (q) fprintf(f, ", "); - fprintf(f, "anychar-star-pl"); + fprintf(f, "anychar-star-ml"); } fprintf(f, "]"); @@ -4445,8 +5517,9 @@ print_anchor(FILE* f, int anchor) static void print_optimize_info(FILE* f, regex_t* reg) { - static char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", - "EXACT_IC", "MAP" }; + static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", + "EXACT_IC", "MAP", + "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); @@ -4466,40 +5539,49 @@ print_optimize_info(FILE* f, regex_t* reg) for (p = reg->exact; p < reg->exact_end; p++) { fputc(*p, f); } - fprintf(f, "]: length: %d\n", (reg->exact_end - reg->exact)); + fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { - int i, n = 0; + int c, i, n = 0; + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { + c = 0; fputc('[', f); - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) - if (reg->map[i] && enc_len(reg->enc, i) == 1 && - ONIGENC_IS_CODE_PRINT(reg->enc, i)) - fputc(i, f); + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { + if (reg->map[i] != 0) { + if (c > 0) fputs(", ", f); + c++; + if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && + ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) + fputc(i, f); + else + fprintf(f, "%d", i); + } + } fprintf(f, "]\n"); } } } -#endif /* ONIG_DEBUG */ +#endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */ -static void +extern void onig_free_body(regex_t* reg) { - if (IS_NOT_NULL(reg->p)) xfree(reg->p); - if (IS_NOT_NULL(reg->exact)) xfree(reg->exact); - if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map); - if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward); - if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); - if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain); + if (IS_NOT_NULL(reg)) { + xfree(reg->p); + xfree(reg->exact); + xfree(reg->repeat_range); + onig_free(reg->chain); #ifdef USE_NAMED_GROUP - onig_names_free(reg); + onig_names_free(reg); #endif + } } extern void @@ -4511,152 +5593,152 @@ onig_free(regex_t* reg) } } -#define REGEX_TRANSFER(to,from) do {\ - (to)->state = ONIG_STATE_MODIFY;\ - onig_free_body(to);\ - xmemcpy(to, from, sizeof(regex_t));\ - xfree(from);\ -} while (0) - -static void -onig_transfer(regex_t* to, regex_t* from) +static void* +dup_copy(const void *ptr, size_t size) { - THREAD_ATOMIC_START; - REGEX_TRANSFER(to, from); - THREAD_ATOMIC_END; -} - -#define REGEX_CHAIN_HEAD(reg) do {\ - while (IS_NOT_NULL((reg)->chain)) {\ - (reg) = (reg)->chain;\ - }\ -} while (0) - -static void -onig_chain_link_add(regex_t* to, regex_t* add) -{ - THREAD_ATOMIC_START; - REGEX_CHAIN_HEAD(to); - to->chain = add; - THREAD_ATOMIC_END; -} - -extern void -onig_chain_reduce(regex_t* reg) -{ - regex_t *head, *prev; - - THREAD_ATOMIC_START; - prev = reg; - head = prev->chain; - if (IS_NOT_NULL(head)) { - reg->state = ONIG_STATE_MODIFY; - while (IS_NOT_NULL(head->chain)) { - prev = head; - head = head->chain; - } - prev->chain = (regex_t* )NULL; - REGEX_TRANSFER(reg, head); + void *newptr = xmalloc(size); + if (IS_NOT_NULL(newptr)) { + memcpy(newptr, ptr, size); } - THREAD_ATOMIC_END; + return newptr; } -#if 0 extern int -onig_clone(regex_t** to, regex_t* from) +onig_reg_copy(regex_t** nreg, regex_t* oreg) { - int r, size; - regex_t* reg; - - if (ONIG_STATE(from) == ONIG_STATE_NORMAL) { - from->state++; /* increment as search counter */ - if (IS_NOT_NULL(from->chain)) { - onig_chain_reduce(from); - from->state++; - } - } - else { - int n = 0; - while (ONIG_STATE(from) < ONIG_STATE_NORMAL) { - if (++n > THREAD_PASS_LIMIT_COUNT) - return ONIGERR_OVER_THREAD_PASS_LIMIT_COUNT; - THREAD_PASS; - } - from->state++; /* increment as search counter */ - } + if (IS_NOT_NULL(oreg)) { + regex_t *reg = *nreg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(reg)) return ONIGERR_MEMORY; - r = onig_alloc_init(®, ONIG_OPTION_NONE, from->enc, ONIG_SYNTAX_DEFAULT); - if (r != 0) { - from->state--; - return r; - } + *reg = *oreg; - xmemcpy(reg, from, sizeof(onig_t)); - reg->state = ONIG_STATE_NORMAL; - reg->chain = (regex_t* )NULL; +# define COPY_FAILED(mem, size) IS_NULL(reg->mem = dup_copy(reg->mem, size)) - if (from->p) { - reg->p = (UChar* )xmalloc(reg->alloc); - if (IS_NULL(reg->p)) goto mem_error; - xmemcpy(reg->p, from->p, reg->alloc); - } + if (IS_NOT_NULL(reg->exact)) { + size_t exact_size = reg->exact_end - reg->exact; + if (COPY_FAILED(exact, exact_size)) + goto err; + (reg)->exact_end = (reg)->exact + exact_size; + } - if (from->exact) { - reg->exact = (UChar* )xmalloc(from->exact_end - from->exact); - if (IS_NULL(reg->exact)) goto mem_error; - reg->exact_end = reg->exact + (from->exact_end - from->exact); - xmemcpy(reg->exact, from->exact, reg->exact_end - reg->exact); + if (IS_NOT_NULL(reg->p)) { + if (COPY_FAILED(p, reg->alloc)) + goto err_p; + } + if (IS_NOT_NULL(reg->repeat_range)) { + if (COPY_FAILED(repeat_range, reg->repeat_range_alloc * sizeof(OnigRepeatRange))) + goto err_repeat_range; + } + if (IS_NOT_NULL(reg->name_table)) { + if (onig_names_copy(reg, oreg)) + goto err_name_table; + } + if (IS_NOT_NULL(reg->chain)) { + if (onig_reg_copy(®->chain, reg->chain)) + goto err_chain; + } + return 0; +# undef COPY_FAILED + + err_chain: + onig_names_free(reg); + err_name_table: + xfree(reg->repeat_range); + err_repeat_range: + xfree(reg->p); + err_p: + xfree(reg->exact); + err: + xfree(reg); + return ONIGERR_MEMORY; } + return 0; +} - if (from->int_map) { - size = sizeof(int) * ONIG_CHAR_TABLE_SIZE; - reg->int_map = (int* )xmalloc(size); - if (IS_NULL(reg->int_map)) goto mem_error; - xmemcpy(reg->int_map, from->int_map, size); - } +#ifdef RUBY +size_t +onig_memsize(const regex_t *reg) +{ + size_t size = sizeof(regex_t); + if (IS_NULL(reg)) return 0; + if (IS_NOT_NULL(reg->p)) size += reg->alloc; + if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact; + if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange); + if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain); - if (from->int_map_backward) { - size = sizeof(int) * ONIG_CHAR_TABLE_SIZE; - reg->int_map_backward = (int* )xmalloc(size); - if (IS_NULL(reg->int_map_backward)) goto mem_error; - xmemcpy(reg->int_map_backward, from->int_map_backward, size); - } + return size; +} -#ifdef USE_NAMED_GROUP - reg->name_table = names_clone(from); /* names_clone is not implemented */ +size_t +onig_region_memsize(const OnigRegion *regs) +{ + size_t size = sizeof(*regs); + if (IS_NULL(regs)) return 0; + size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end)); + return size; +} #endif - from->state--; - *to = reg; - return 0; +#define REGEX_TRANSFER(to,from) do {\ + onig_free_body(to);\ + xmemcpy(to, from, sizeof(regex_t));\ + xfree(from);\ +} while (0) - mem_error: - from->state--; - return ONIGERR_MEMORY; +#if 0 +extern void +onig_transfer(regex_t* to, regex_t* from) +{ + REGEX_TRANSFER(to, from); } #endif -#ifdef ONIG_DEBUG -static void print_compiled_byte_code_list P_((FILE* f, regex_t* reg)); +#ifdef ONIG_DEBUG_COMPILE +static void print_compiled_byte_code_list(FILE* f, regex_t* reg); #endif #ifdef ONIG_DEBUG_PARSE_TREE -static void print_tree P_((FILE* f, Node* node)); +static void print_tree(FILE* f, Node* node); #endif +#ifdef RUBY extern int -onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, - OnigErrorInfo* einfo) +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +{ + return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0); +} +#endif + +#ifdef RUBY +extern int +onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo, const char *sourcefile, int sourceline) +#else +extern int +onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, + OnigErrorInfo* einfo) +#endif { #define COMPILE_INIT_SIZE 20 - int r, init_size; + int r; + OnigDistance init_size; Node* root; - ScanEnv scan_env; + ScanEnv scan_env = {0}; #ifdef USE_SUBEXP_CALL UnsetAddrList uslist; #endif - reg->state = ONIG_STATE_COMPILING; + if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; + +#ifdef RUBY + scan_env.sourcefile = sourcefile; + scan_env.sourceline = sourceline; +#endif + +#ifdef ONIG_DEBUG + print_enc_string(stderr, reg->enc, pattern, pattern_end); +#endif if (reg->alloc == 0) { init_size = (pattern_end - pattern) * 2; @@ -4672,10 +5754,20 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, reg->num_null_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (OnigRepeatRange* )NULL; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + reg->num_comb_exp_check = 0; +#endif r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; +#ifdef ONIG_DEBUG_PARSE_TREE +# if 0 + fprintf(stderr, "ORIGINAL PARSE TREE:\n"); + print_tree(stderr, root); +# endif +#endif + #ifdef USE_NAMED_GROUP /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && @@ -4690,10 +5782,6 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, } #endif -#ifdef ONIG_DEBUG_PARSE_TREE - print_tree(stderr, root); -#endif - #ifdef USE_SUBEXP_CALL if (scan_env.num_call > 0) { r = unset_addr_list_init(&uslist, scan_env.num_call); @@ -4715,6 +5803,10 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, r = setup_tree(root, reg, 0, &scan_env); if (r != 0) goto err_unset; +#ifdef ONIG_DEBUG_PARSE_TREE + print_tree(stderr, root); +#endif + reg->capture_history = scan_env.capture_history; reg->bt_mem_start = scan_env.bt_mem_start; reg->bt_mem_start |= reg->capture_history; @@ -4725,6 +5817,33 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, reg->bt_mem_end |= reg->capture_history; } +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (scan_env.backrefed_mem == 0 +# ifdef USE_SUBEXP_CALL + || scan_env.num_call == 0 +# endif + ) { + setup_comb_exp_check(root, 0, &scan_env); +# ifdef USE_SUBEXP_CALL + if (scan_env.has_recursion != 0) { + scan_env.num_comb_exp_check = 0; + } + else +# endif + if (scan_env.comb_exp_max_regnum > 0) { + int i; + for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) { + if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) { + scan_env.num_comb_exp_check = 0; + break; + } + } + } + } + + reg->num_comb_exp_check = scan_env.num_comb_exp_check; +#endif + clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); @@ -4751,9 +5870,9 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { if (reg->bt_mem_start != 0) - reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; + reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else - reg->stack_pop_level = STACK_POP_LEVEL_FREE; + reg->stack_pop_level = STACK_POP_LEVEL_FREE; } } #ifdef USE_SUBEXP_CALL @@ -4764,14 +5883,14 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE -#ifdef USE_NAMED_GROUP +# ifdef USE_NAMED_GROUP onig_print_names(stderr, reg); -#endif +# endif print_compiled_byte_code_list(stderr, reg); #endif end: - reg->state = ONIG_STATE_NORMAL; + onig_reg_resize(reg); return r; err_unset: @@ -4783,50 +5902,44 @@ onig_compile(regex_t* reg, UChar* pattern, UChar* pattern_end, err: if (IS_NOT_NULL(scan_env.error)) { if (IS_NOT_NULL(einfo)) { + einfo->enc = scan_env.enc; einfo->par = scan_env.error; einfo->par_end = scan_env.error_end; } } - if (IS_NOT_NULL(root)) onig_node_free(root); - if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) - xfree(scan_env.mem_nodes_dynamic); - return r; -} - -extern int -onig_recompile(regex_t* reg, UChar* pattern, UChar* pattern_end, - OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, - OnigErrorInfo* einfo) -{ - int r; - regex_t *new_reg; + onig_node_free(root); + xfree(scan_env.mem_nodes_dynamic); - r = onig_new(&new_reg, pattern, pattern_end, option, enc, syntax, einfo); - if (r) return r; - if (ONIG_STATE(reg) == ONIG_STATE_NORMAL) { - onig_transfer(reg, new_reg); - } - else { - onig_chain_link_add(reg, new_reg); - } - return 0; + return r; } static int onig_inited = 0; extern int -onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, - OnigSyntaxType* syntax) +onig_reg_init(regex_t* reg, OnigOptionType option, + OnigCaseFoldType case_fold_flag, + OnigEncoding enc, const OnigSyntaxType* syntax) { if (! onig_inited) onig_init(); + if (IS_NULL(reg)) + return ONIGERR_INVALID_ARGUMENT; + + (reg)->exact = (UChar* )NULL; + (reg)->chain = (regex_t* )NULL; + (reg)->p = (UChar* )NULL; + (reg)->name_table = (void* )NULL; + (reg)->repeat_range = (OnigRepeatRange* )NULL; + if (ONIGENC_IS_UNDEF(enc)) - return ONIGERR_DEFAULT_ENCODING_IS_NOT_SETTED; + return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET; - *reg = (regex_t* )xmalloc(sizeof(regex_t)); - if (IS_NULL(*reg)) return ONIGERR_MEMORY; + if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) + == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) { + return ONIGERR_INVALID_COMBINATION_OF_OPTIONS; + } if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) { option |= syntax->options; @@ -4835,83 +5948,201 @@ onig_alloc_init(regex_t** reg, OnigOptionType option, OnigEncoding enc, else option |= syntax->options; - (*reg)->state = ONIG_STATE_NORMAL; - (*reg)->enc = enc; - (*reg)->options = option; - (*reg)->syntax = syntax; - (*reg)->optimize = 0; - (*reg)->exact = (UChar* )NULL; - (*reg)->int_map = (int* )NULL; - (*reg)->int_map_backward = (int* )NULL; - (*reg)->chain = (regex_t* )NULL; - - (*reg)->p = (UChar* )NULL; - (*reg)->alloc = 0; - (*reg)->used = 0; - (*reg)->name_table = (void* )NULL; + (reg)->enc = enc; + (reg)->options = option; + (reg)->syntax = syntax; + (reg)->optimize = 0; + + (reg)->alloc = 0; + (reg)->used = 0; + + (reg)->case_fold_flag = case_fold_flag; + + (reg)->timelimit = 0; return 0; } extern int -onig_new(regex_t** reg, UChar* pattern, UChar* pattern_end, - OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, - OnigErrorInfo* einfo) +onig_new_without_alloc(regex_t* reg, const UChar* pattern, + const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, + const OnigSyntaxType* syntax, OnigErrorInfo* einfo) { int r; - if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; - - r = onig_alloc_init(reg, option, enc, syntax); + r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) return r; - r = onig_compile(*reg, pattern, pattern_end, einfo); + r = onig_compile(reg, pattern, pattern_end, einfo); + return r; +} + +extern int +onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, + OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, + OnigErrorInfo* einfo) +{ + *reg = (regex_t* )xmalloc(sizeof(regex_t)); + if (IS_NULL(*reg)) return ONIGERR_MEMORY; + + int r = onig_new_without_alloc(*reg, pattern, pattern_end, option, enc, syntax, einfo); if (r) { onig_free(*reg); *reg = NULL; } + return r; } extern int -onig_init() +onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED) +{ + return onig_init(); +} + +extern int +onig_init(void) { if (onig_inited != 0) return 0; onig_inited = 1; - THREAD_ATOMIC_START; +#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) + _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); +#endif onigenc_init(); - onigenc_set_default_caseconv_table((UChar* )0); + /* onigenc_set_default_caseconv_table((UChar* )0); */ #ifdef ONIG_DEBUG_STATISTICS onig_statistics_init(); #endif - THREAD_ATOMIC_END; return 0; } + +static OnigEndCallListItemType* EndCallTop; + +extern void onig_add_end_call(void (*func)(void)) +{ + OnigEndCallListItemType* item; + + item = (OnigEndCallListItemType* )xmalloc(sizeof(*item)); + if (item == 0) return ; + + item->next = EndCallTop; + item->func = func; + + EndCallTop = item; +} + +static void +exec_end_call_list(void) +{ + OnigEndCallListItemType* prev; + void (*func)(void); + + while (EndCallTop != 0) { + func = EndCallTop->func; + (*func)(); + + prev = EndCallTop; + EndCallTop = EndCallTop->next; + xfree(prev); + } +} + extern int -onig_end() +onig_end(void) { + exec_end_call_list(); + #ifdef ONIG_DEBUG_STATISTICS onig_print_statistics(stderr); #endif -#ifdef USE_RECYCLE_NODE - onig_free_node_list(); +#if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) + _CrtDumpMemoryLeaks(); #endif onig_inited = 0; + return 0; } +extern int +onig_is_in_code_range(const UChar* p, OnigCodePoint code) +{ + OnigCodePoint n, *data; + OnigCodePoint low, high, x; + + GET_CODE_POINT(n, p); + data = (OnigCodePoint* )p; + data++; + + for (low = 0, high = n; low < high; ) { + x = (low + high) >> 1; + if (code > data[x * 2 + 1]) + low = x + 1; + else + high = x; + } + + return ((low < n && code >= data[low * 2]) ? 1 : 0); +} + +extern int +onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc) +{ + int found; + + if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) { + if (IS_NULL(cc->mbuf)) { + found = 0; + } + else { + found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); + } + } + else { + found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); + } + + if (IS_NCCLASS_NOT(cc)) + return !found; + else + return found; +} + +extern int +onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) +{ + int len; + + if (ONIGENC_MBC_MINLEN(enc) > 1) { + len = 2; + } + else { + len = ONIGENC_CODE_TO_MBCLEN(enc, code); + } + return onig_is_code_in_cc_len(len, code, cc); +} + #ifdef ONIG_DEBUG +/* arguments type */ +# define ARG_SPECIAL -1 +# define ARG_NON 0 +# define ARG_RELADDR 1 +# define ARG_ABSADDR 2 +# define ARG_LENGTH 3 +# define ARG_MEMNUM 4 +# define ARG_OPTION 5 +# define ARG_STATE_CHECK 6 + OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, { OP_END, "end", ARG_NON }, @@ -4941,64 +6172,81 @@ OnigOpInfoType OnigOpInfo[] = { { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON }, { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, - { OP_WORD, "word", ARG_NON }, - { OP_NOT_WORD, "not-word", ARG_NON }, - { OP_WORD_SB, "word-sb", ARG_NON }, - { OP_WORD_MB, "word-mb", ARG_NON }, - { OP_WORD_BOUND, "word-bound", ARG_NON }, - { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, - { OP_WORD_BEGIN, "word-begin", ARG_NON }, - { OP_WORD_END, "word-end", ARG_NON }, - { OP_BEGIN_BUF, "begin-buf", ARG_NON }, - { OP_END_BUF, "end-buf", ARG_NON }, - { OP_BEGIN_LINE, "begin-line", ARG_NON }, - { OP_END_LINE, "end-line", ARG_NON }, - { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, - { OP_BEGIN_POSITION, "begin-position", ARG_NON }, - { OP_BACKREF1, "backref1", ARG_NON }, - { OP_BACKREF2, "backref2", ARG_NON }, - { OP_BACKREF3, "backref3", ARG_NON }, - { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_MEMNUM }, - { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, - { OP_BACKREF_MULTI_IC, "backref_multi-ic",ARG_SPECIAL }, - { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, - { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, - { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, - { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, - { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, - { OP_SET_OPTION, "set-option", ARG_OPTION }, - { OP_FAIL, "fail", ARG_NON }, - { OP_JUMP, "jump", ARG_RELADDR }, - { OP_PUSH, "push", ARG_RELADDR }, - { OP_POP, "pop", ARG_NON }, - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, - { OP_REPEAT, "repeat", ARG_SPECIAL }, - { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, - { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, - { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, - { OP_NULL_CHECK_START, "null-check-start",ARG_MEMNUM }, - { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, - { OP_PUSH_POS, "push-pos", ARG_NON }, - { OP_POP_POS, "pop-pos", ARG_NON }, - { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, - { OP_FAIL_POS, "fail-pos", ARG_NON }, - { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, - { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, - { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, + { OP_WORD, "word", ARG_NON }, + { OP_NOT_WORD, "not-word", ARG_NON }, + { OP_WORD_BOUND, "word-bound", ARG_NON }, + { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, + { OP_WORD_BEGIN, "word-begin", ARG_NON }, + { OP_WORD_END, "word-end", ARG_NON }, + { OP_ASCII_WORD, "ascii-word", ARG_NON }, + { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON }, + { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON }, + { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON }, + { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON }, + { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON }, + { OP_BEGIN_BUF, "begin-buf", ARG_NON }, + { OP_END_BUF, "end-buf", ARG_NON }, + { OP_BEGIN_LINE, "begin-line", ARG_NON }, + { OP_END_LINE, "end-line", ARG_NON }, + { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, + { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BACKREF1, "backref1", ARG_NON }, + { OP_BACKREF2, "backref2", ARG_NON }, + { OP_BACKREFN, "backrefn", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, + { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, + { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, + { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL }, + { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, + { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, + { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, + { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, + { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, + { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_KEEP, "keep", ARG_NON }, + { OP_FAIL, "fail", ARG_NON }, + { OP_JUMP, "jump", ARG_RELADDR }, + { OP_PUSH, "push", ARG_RELADDR }, + { OP_POP, "pop", ARG_NON }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, + { OP_REPEAT, "repeat", ARG_SPECIAL }, + { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, + { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, + { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, + { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM }, + { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, + { OP_PUSH_POS, "push-pos", ARG_NON }, + { OP_POP_POS, "pop-pos", ARG_NON }, + { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, + { OP_FAIL_POS, "fail-pos", ARG_NON }, + { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, + { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, + { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, - { OP_CALL, "call", ARG_ABSADDR }, - { OP_RETURN, "return", ARG_NON }, + { OP_PUSH_ABSENT_POS, "push-absent-pos", ARG_NON }, + { OP_ABSENT, "absent", ARG_RELADDR }, + { OP_ABSENT_END, "absent-end", ARG_NON }, + { OP_CALL, "call", ARG_ABSADDR }, + { OP_RETURN, "return", ARG_NON }, + { OP_CONDITION, "condition", ARG_SPECIAL }, + { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL }, + { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL }, + { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_ML_STAR, + "state-check-anychar-ml*", ARG_STATE_CHECK }, { -1, "", ARG_NON } }; -static char* +static const char* op2name(int opcode) { int i; @@ -5022,15 +6270,17 @@ op2arg_type(int opcode) return ARG_SPECIAL; } +# ifdef ONIG_DEBUG_PARSE_TREE static void Indent(FILE* f, int indent) { int i; for (i = 0; i < indent; i++) putc(' ', f); } +# endif /* ONIG_DEBUG_PARSE_TREE */ static void -p_string(FILE* f, int len, UChar* s) +p_string(FILE* f, ptrdiff_t len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } @@ -5046,12 +6296,14 @@ p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) } extern void -onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) +onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, + OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; LengthType len; MemNumType mem; + StateCheckNumType scn; OnigCodePoint code; UChar *q; @@ -5063,9 +6315,8 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) case ARG_NON: break; case ARG_RELADDR: - addr = *((RelAddrType* )bp); - bp += SIZE_RELADDR; - fprintf(f, ":(%d)", addr); + GET_RELADDR_INC(addr, bp); + fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr); break; case ARG_ABSADDR: GET_ABSADDR_INC(addr, bp); @@ -5082,11 +6333,17 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case ARG_OPTION: { - OnigOptionType option = *((OnigOptionType* )bp); - bp += SIZE_OPTION; - fprintf(f, ":%d", option); + OnigOptionType option = *((OnigOptionType* )bp); + bp += SIZE_OPTION; + fprintf(f, ":%d", option); } break; + + case ARG_STATE_CHECK: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + fprintf(f, ":%d", scn); + break; } } else { @@ -5108,7 +6365,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) p_len_string(f, len, 1, bp); bp += len; break; - + case OP_EXACTMB2N1: p_string(f, 2, bp); bp += 2; break; case OP_EXACTMB2N2: @@ -5127,18 +6384,20 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) break; case OP_EXACTMBN: { - int mb_len; - - GET_LENGTH_INC(mb_len, bp); - GET_LENGTH_INC(len, bp); - fprintf(f, ":%d:%d:", mb_len, len); - n = len * mb_len; - while (n-- > 0) { fputc(*bp++, f); } + int mb_len; + + GET_LENGTH_INC(mb_len, bp); + GET_LENGTH_INC(len, bp); + fprintf(f, ":%d:%d:", mb_len, len); + n = len * mb_len; + while (n-- > 0) { fputc(*bp++, f); } } break; case OP_EXACT1_IC: - p_string(f, 1, bp++); + len = enclen(enc, bp, bpend); + p_string(f, len, bp); + bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); @@ -5162,9 +6421,9 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) case OP_CCLASS_MB_NOT: GET_LENGTH_INC(len, bp); q = bp; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +# ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); -#endif +# endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d", (int )code, len); @@ -5176,33 +6435,59 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) bp += SIZE_BITSET; GET_LENGTH_INC(len, bp); q = bp; -#ifndef PLATFORM_UNALIGNED_WORD_ACCESS +# ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); -#endif +# endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d:%d", n, (int )code, len); break; - case OP_BACKREF_MULTI: + case OP_BACKREFN_IC: + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + fprintf(f, ":%d", mem); + break; + case OP_BACKREF_MULTI_IC: + case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { - GET_MEMNUM_INC(mem, bp); - if (i > 0) fputs(", ", f); - fprintf(f, "%d", mem); + GET_MEMNUM_INC(mem, bp); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", mem); + } + break; + + case OP_BACKREF_WITH_LEVEL: + { + OnigOptionType option; + LengthType level; + + GET_OPTION_INC(option, bp); + fprintf(f, ":%d", option); + GET_LENGTH_INC(level, bp); + fprintf(f, ":%d", level); + + fputs(" ", f); + GET_LENGTH_INC(len, bp); + for (i = 0; i < len; i++) { + GET_MEMNUM_INC(mem, bp); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", mem); + } } break; case OP_REPEAT: case OP_REPEAT_NG: { - mem = *((MemNumType* )bp); - bp += SIZE_MEMNUM; - addr = *((RelAddrType* )bp); - bp += SIZE_RELADDR; - fprintf(f, ":%d:%d", mem, addr); + mem = *((MemNumType* )bp); + bp += SIZE_MEMNUM; + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":%d:%d", mem, addr); } break; @@ -5210,7 +6495,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) case OP_PUSH_IF_PEEK_NEXT: addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; - fprintf(f, ":(%d)", addr); + fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr); p_string(f, 1, bp); bp += 1; break; @@ -5223,18 +6508,34 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp) case OP_PUSH_LOOK_BEHIND_NOT: GET_RELADDR_INC(addr, bp); GET_LENGTH_INC(len, bp); - fprintf(f, ":%d:(%d)", len, addr); + fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr); + break; + + case OP_STATE_CHECK_PUSH: + case OP_STATE_CHECK_PUSH_OR_JUMP: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr); + break; + + case OP_CONDITION: + GET_MEMNUM_INC(mem, bp); + GET_RELADDR_INC(addr, bp); + fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr); break; default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", - *--bp); + bp[-1]); } } fputs("]", f); if (nextp) *nextp = bp; } +# ifdef ONIG_DEBUG_COMPILE static void print_compiled_byte_code_list(FILE* f, regex_t* reg) { @@ -5242,27 +6543,27 @@ print_compiled_byte_code_list(FILE* f, regex_t* reg) UChar* bp = reg->p; UChar* end = reg->p + reg->used; - fprintf(f, "code length: %d\n", reg->used); + fprintf(f, "code length: %d", reg->used); - ncode = 0; + ncode = -1; while (bp < end) { ncode++; - if (bp > reg->p) { - if (ncode % 5 == 0) - fprintf(f, "\n"); - else - fputs(" ", f); - } - onig_print_compiled_byte_code(f, bp, &bp); + if (ncode % 5 == 0) + fprintf(f, "\n%ld:", bp - reg->p); + else + fprintf(f, " %ld:", bp - reg->p); + onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc); } fprintf(f, "\n"); } +# endif /* ONIG_DEBUG_COMPILE */ +# ifdef ONIG_DEBUG_PARSE_TREE static void print_indent_tree(FILE* f, Node* node, int indent) { - int i, type; + int i, type, container_p = 0; int add = 3; UChar* p; @@ -5274,71 +6575,73 @@ print_indent_tree(FILE* f, Node* node, int indent) type = NTYPE(node); switch (type) { - case N_LIST: - case N_ALT: - if (NTYPE(node) == N_LIST) - fprintf(f, "<list:%x>\n", (int )node); + case NT_LIST: + case NT_ALT: + if (NTYPE(node) == NT_LIST) + fprintf(f, "<list:%"PRIxPTR">\n", (intptr_t )node); else - fprintf(f, "<alt:%x>\n", (int )node); + fprintf(f, "<alt:%"PRIxPTR">\n", (intptr_t )node); - print_indent_tree(f, NCONS(node).left, indent + add); - while (IS_NOT_NULL(node = NCONS(node).right)) { + print_indent_tree(f, NCAR(node), indent + add); + while (IS_NOT_NULL(node = NCDR(node))) { if (NTYPE(node) != type) { - fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node)); - exit(0); + fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node)); + exit(0); } - print_indent_tree(f, NCONS(node).left, indent + add); + print_indent_tree(f, NCAR(node), indent + add); } break; - case N_STRING: - fprintf(f, "<string%s:%x>", - (NSTRING_IS_RAW(node) ? "-raw" : ""), (int )node); - for (p = NSTRING(node).s; p < NSTRING(node).end; p++) { + case NT_STR: + fprintf(f, "<string%s:%"PRIxPTR">", + (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node); + for (p = NSTR(node)->s; p < NSTR(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) - fputc(*p, f); + fputc(*p, f); else { - fprintf(f, " 0x%02x", *p); + fprintf(f, " 0x%02x", *p); } } break; - case N_CCLASS: - fprintf(f, "<cclass:%x>", (int )node); - if (NCCLASS(node).not) fputs(" not", f); - if (NCCLASS(node).mbuf) { - BBuf* bbuf = NCCLASS(node).mbuf; - for (i = 0; i < bbuf->used; i++) { - if (i > 0) fprintf(f, ","); - fprintf(f, "%0x", bbuf->p[i]); + case NT_CCLASS: + fprintf(f, "<cclass:%"PRIxPTR">", (intptr_t )node); + if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f); + if (NCCLASS(node)->mbuf) { + BBuf* bbuf = NCCLASS(node)->mbuf; + OnigCodePoint* data = (OnigCodePoint* )bbuf->p; + OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used); + fprintf(f, "%d", *data++); + for (; data < end; data+=2) { + fprintf(f, ","); + fprintf(f, "%04x-%04x", data[0], data[1]); } } -#if 0 - fprintf(f, "\n"); - Indent(f, indent); - for (i = 0; i < SINGLE_BYTE_SIZE; i++) - fputc((BITSET_AT(NCCLASS(node).bs, i) ? '1' : '0'), f); -#endif break; - case N_CTYPE: - fprintf(f, "<ctype:%x> ", (int )node); - switch (NCTYPE(node).type) { - case CTYPE_WORD: fputs("word", f); break; - case CTYPE_NOT_WORD: fputs("not word", f); break; + case NT_CTYPE: + fprintf(f, "<ctype:%"PRIxPTR"> ", (intptr_t )node); + switch (NCTYPE(node)->ctype) { + case ONIGENC_CTYPE_WORD: + if (NCTYPE(node)->not != 0) + fputs("not word", f); + else + fputs("word", f); + break; + default: fprintf(f, "ERROR: undefined ctype.\n"); exit(0); } break; - case N_ANYCHAR: - fprintf(f, "<anychar:%x>", (int )node); + case NT_CANY: + fprintf(f, "<anychar:%"PRIxPTR">", (intptr_t )node); break; - case N_ANCHOR: - fprintf(f, "<anchor:%x> ", (int )node); - switch (NANCHOR(node).type) { + case NT_ANCHOR: + fprintf(f, "<anchor:%"PRIxPTR"> ", (intptr_t )node); + switch (NANCHOR(node)->type) { case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; case ANCHOR_END_BUF: fputs("end buf", f); break; case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; @@ -5348,14 +6651,15 @@ print_indent_tree(FILE* f, Node* node, int indent) case ANCHOR_WORD_BOUND: fputs("word bound", f); break; case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; -#ifdef USE_WORD_BEGIN_END +# ifdef USE_WORD_BEGIN_END case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; case ANCHOR_WORD_END: fputs("word end", f); break; -#endif - case ANCHOR_PREC_READ: fputs("prec read", f); break; - case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); break; - case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); break; - case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); break; +# endif + case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break; + case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break; + case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break; + case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break; + case ANCHOR_KEEP: fputs("keep",f); break; default: fprintf(f, "ERROR: undefined anchor type.\n"); @@ -5363,55 +6667,60 @@ print_indent_tree(FILE* f, Node* node, int indent) } break; - case N_BACKREF: + case NT_BREF: { int* p; - BackrefNode* br = &(NBACKREF(node)); + BRefNode* br = NBREF(node); p = BACKREFS_P(br); - fprintf(f, "<backref:%x>", (int )node); + fprintf(f, "<backref:%"PRIxPTR">", (intptr_t )node); for (i = 0; i < br->back_num; i++) { - if (i > 0) fputs(", ", f); - fprintf(f, "%d", p[i]); + if (i > 0) fputs(", ", f); + fprintf(f, "%d", p[i]); } } break; -#ifdef USE_SUBEXP_CALL - case N_CALL: +# ifdef USE_SUBEXP_CALL + case NT_CALL: { - CallNode* cn = &(NCALL(node)); - fprintf(f, "<call:%x>", (int )node); + CallNode* cn = NCALL(node); + fprintf(f, "<call:%"PRIxPTR">", (intptr_t )node); p_string(f, cn->name_end - cn->name, cn->name); } break; -#endif +# endif - case N_QUALIFIER: - fprintf(f, "<qualifier:%x>{%d,%d}%s\n", (int )node, - NQUALIFIER(node).lower, NQUALIFIER(node).upper, - (NQUALIFIER(node).greedy ? "" : "?")); - print_indent_tree(f, NQUALIFIER(node).target, indent + add); + case NT_QTFR: + fprintf(f, "<quantifier:%"PRIxPTR">{%d,%d}%s\n", (intptr_t )node, + NQTFR(node)->lower, NQTFR(node)->upper, + (NQTFR(node)->greedy ? "" : "?")); + print_indent_tree(f, NQTFR(node)->target, indent + add); break; - case N_EFFECT: - fprintf(f, "<effect:%x> ", (int )node); - switch (NEFFECT(node).type) { - case EFFECT_OPTION: - fprintf(f, "option:%d\n", NEFFECT(node).option); - print_indent_tree(f, NEFFECT(node).target, indent + add); + case NT_ENCLOSE: + fprintf(f, "<enclose:%"PRIxPTR"> ", (intptr_t )node); + switch (NENCLOSE(node)->type) { + case ENCLOSE_OPTION: + fprintf(f, "option:%d", NENCLOSE(node)->option); break; - case EFFECT_MEMORY: - fprintf(f, "memory:%d", NEFFECT(node).regnum); + case ENCLOSE_MEMORY: + fprintf(f, "memory:%d", NENCLOSE(node)->regnum); break; - case EFFECT_STOP_BACKTRACK: + case ENCLOSE_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; + case ENCLOSE_CONDITION: + fprintf(f, "condition:%d", NENCLOSE(node)->regnum); + break; + case ENCLOSE_ABSENT: + fprintf(f, "absent"); + break; default: break; } fprintf(f, "\n"); - print_indent_tree(f, NEFFECT(node).target, indent + add); + print_indent_tree(f, NENCLOSE(node)->target, indent + add); break; default: @@ -5419,17 +6728,19 @@ print_indent_tree(FILE* f, Node* node, int indent) break; } - if (type != N_LIST && type != N_ALT && type != N_QUALIFIER && - type != N_EFFECT) + if (type != NT_LIST && type != NT_ALT && type != NT_QTFR && + type != NT_ENCLOSE) fprintf(f, "\n"); + + if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add); + fflush(f); } -#endif /* ONIG_DEBUG */ -#ifdef ONIG_DEBUG_PARSE_TREE static void print_tree(FILE* f, Node* node) { print_indent_tree(f, node, 0); } -#endif +# endif /* ONIG_DEBUG_PARSE_TREE */ +#endif /* ONIG_DEBUG */ |
