/********************************************************************** regcomp.c - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- * Copyright (c) 2002-2013 K.Kosako * Copyright (c) 2011-2016 K.Takata * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include "regparse.h" OnigCaseFoldType OnigDefaultCaseFoldFlag = ONIGENC_CASE_FOLD_MIN; extern OnigCaseFoldType onig_get_default_case_fold_flag(void) { return OnigDefaultCaseFoldFlag; } extern int onig_set_default_case_fold_flag(OnigCaseFoldType case_fold_flag) { OnigDefaultCaseFoldFlag = case_fold_flag; return 0; } #ifndef PLATFORM_UNALIGNED_WORD_ACCESS static unsigned char PadBuf[WORD_ALIGNMENT_SIZE]; #endif #if 0 static UChar* str_dup(UChar* s, UChar* end) { ptrdiff_t len = end - s; if (len > 0) { UChar* r = (UChar* )xmalloc(len + 1); CHECK_NULL_RETURN(r); xmemcpy(r, s, len); r[len] = (UChar )0; return r; } else return NULL; } #endif static void swap_node(Node* a, Node* b) { Node c; c = *a; *a = *b; *b = c; if (NTYPE(a) == NT_STR) { StrNode* sn = NSTR(a); if (sn->capa == 0) { size_t len = sn->end - sn->s; sn->s = sn->buf; sn->end = sn->s + len; } } if (NTYPE(b) == NT_STR) { StrNode* sn = NSTR(b); if (sn->capa == 0) { size_t len = sn->end - sn->s; sn->s = sn->buf; sn->end = sn->s + len; } } } static OnigDistance distance_add(OnigDistance d1, OnigDistance d2) { if (d1 == ONIG_INFINITE_DISTANCE || d2 == ONIG_INFINITE_DISTANCE) return ONIG_INFINITE_DISTANCE; else { if (d1 <= ONIG_INFINITE_DISTANCE - d2) return d1 + d2; else return ONIG_INFINITE_DISTANCE; } } static OnigDistance distance_multiply(OnigDistance d, int m) { if (m == 0) return 0; if (d < ONIG_INFINITE_DISTANCE / m) return d * m; else return ONIG_INFINITE_DISTANCE; } static int bitset_is_empty(BitSetRef bs) { int i; for (i = 0; i < BITSET_SIZE; i++) { if (bs[i] != 0) return 0; } return 1; } #ifdef ONIG_DEBUG static int bitset_on_num(BitSetRef bs) { int i, n; n = 0; for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (BITSET_AT(bs, i)) n++; } return n; } #endif extern int onig_bbuf_init(BBuf* buf, OnigDistance size) { if (size <= 0) { size = 0; buf->p = NULL; } else { buf->p = (UChar* )xmalloc(size); if (IS_NULL(buf->p)) return(ONIGERR_MEMORY); } buf->alloc = (unsigned int )size; buf->used = 0; return 0; } #ifdef USE_SUBEXP_CALL static int unset_addr_list_init(UnsetAddrList* uslist, int size) { UnsetAddr* p; p = (UnsetAddr* )xmalloc(sizeof(UnsetAddr)* size); CHECK_NULL_RETURN_MEMERR(p); uslist->num = 0; uslist->alloc = size; uslist->us = p; return 0; } static void unset_addr_list_end(UnsetAddrList* uslist) { if (IS_NOT_NULL(uslist->us)) xfree(uslist->us); } static int unset_addr_list_add(UnsetAddrList* uslist, int offset, struct _Node* node) { UnsetAddr* p; int size; if (uslist->num >= uslist->alloc) { size = uslist->alloc * 2; p = (UnsetAddr* )xrealloc(uslist->us, sizeof(UnsetAddr) * size); CHECK_NULL_RETURN_MEMERR(p); uslist->alloc = size; uslist->us = p; } uslist->us[uslist->num].offset = offset; uslist->us[uslist->num].target = node; uslist->num++; return 0; } #endif /* USE_SUBEXP_CALL */ static int add_opcode(regex_t* reg, int opcode) { BBUF_ADD1(reg, opcode); return 0; } #ifdef USE_COMBINATION_EXPLOSION_CHECK static int add_state_check_num(regex_t* reg, int num) { StateCheckNumType n = (StateCheckNumType )num; BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM); return 0; } #endif static int add_rel_addr(regex_t* reg, int addr) { RelAddrType ra = (RelAddrType )addr; BBUF_ADD(reg, &ra, SIZE_RELADDR); return 0; } static int add_abs_addr(regex_t* reg, int addr) { AbsAddrType ra = (AbsAddrType )addr; BBUF_ADD(reg, &ra, SIZE_ABSADDR); return 0; } static int add_length(regex_t* reg, OnigDistance len) { LengthType l = (LengthType )len; BBUF_ADD(reg, &l, SIZE_LENGTH); return 0; } static int add_mem_num(regex_t* reg, int num) { MemNumType n = (MemNumType )num; BBUF_ADD(reg, &n, SIZE_MEMNUM); return 0; } #if 0 static int add_pointer(regex_t* reg, void* addr) { PointerType ptr = (PointerType )addr; BBUF_ADD(reg, &ptr, SIZE_POINTER); return 0; } #endif static int add_option(regex_t* reg, OnigOptionType option) { BBUF_ADD(reg, &option, SIZE_OPTION); return 0; } static int add_opcode_rel_addr(regex_t* reg, int opcode, int addr) { int r; r = add_opcode(reg, opcode); if (r) return r; r = add_rel_addr(reg, addr); return r; } static int add_bytes(regex_t* reg, UChar* bytes, OnigDistance len) { BBUF_ADD(reg, bytes, len); return 0; } static int add_bitset(regex_t* reg, BitSetRef bs) { BBUF_ADD(reg, bs, SIZE_BITSET); return 0; } static int add_opcode_option(regex_t* reg, int opcode, OnigOptionType option) { int r; r = add_opcode(reg, opcode); if (r) return r; r = add_option(reg, option); return r; } static int compile_length_tree(Node* node, regex_t* reg); static int compile_tree(Node* node, regex_t* reg); #define IS_NEED_STR_LEN_OP_EXACT(op) \ ((op) == OP_EXACTN || (op) == OP_EXACTMB2N ||\ (op) == OP_EXACTMB3N || (op) == OP_EXACTMBN || (op) == OP_EXACTN_IC) static int select_str_opcode(int mb_len, OnigDistance byte_len, int ignore_case) { int op; OnigDistance str_len = (byte_len + mb_len - 1) / mb_len; if (ignore_case) { switch (str_len) { case 1: op = OP_EXACT1_IC; break; default: op = OP_EXACTN_IC; break; } } else { switch (mb_len) { case 1: switch (str_len) { case 1: op = OP_EXACT1; break; case 2: op = OP_EXACT2; break; case 3: op = OP_EXACT3; break; case 4: op = OP_EXACT4; break; case 5: op = OP_EXACT5; break; default: op = OP_EXACTN; break; } break; case 2: switch (str_len) { case 1: op = OP_EXACTMB2N1; break; case 2: op = OP_EXACTMB2N2; break; case 3: op = OP_EXACTMB2N3; break; default: op = OP_EXACTMB2N; break; } break; case 3: op = OP_EXACTMB3N; break; default: op = OP_EXACTMBN; break; } } return op; } static int compile_tree_empty_check(Node* node, regex_t* reg, int empty_info) { int r; int saved_num_null_check = reg->num_null_check; if (empty_info != 0) { r = add_opcode(reg, OP_NULL_CHECK_START); if (r) return r; r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */ if (r) return r; reg->num_null_check++; } r = compile_tree(node, reg); if (r) return r; if (empty_info != 0) { if (empty_info == NQ_TARGET_IS_EMPTY) r = add_opcode(reg, OP_NULL_CHECK_END); else if (empty_info == NQ_TARGET_IS_EMPTY_MEM) r = add_opcode(reg, OP_NULL_CHECK_END_MEMST); else if (empty_info == NQ_TARGET_IS_EMPTY_REC) r = add_opcode(reg, OP_NULL_CHECK_END_MEMST_PUSH); if (r) return r; r = add_mem_num(reg, saved_num_null_check); /* NULL CHECK ID */ } return r; } #ifdef USE_SUBEXP_CALL static int compile_call(CallNode* node, regex_t* reg) { int r; r = add_opcode(reg, OP_CALL); if (r) return r; r = unset_addr_list_add(node->unset_addr_list, BBUF_GET_OFFSET_POS(reg), node->target); if (r) return r; r = add_abs_addr(reg, 0 /*dummy addr.*/); return r; } #endif static int compile_tree_n_times(Node* node, int n, regex_t* reg) { int i, r; for (i = 0; i < n; i++) { r = compile_tree(node, reg); if (r) return r; } return 0; } static int add_compile_string_length(UChar* s ARG_UNUSED, int mb_len, OnigDistance byte_len, regex_t* reg ARG_UNUSED, int ignore_case) { int len; int op = select_str_opcode(mb_len, byte_len, ignore_case); len = SIZE_OPCODE; if (op == OP_EXACTMBN) len += SIZE_LENGTH; if (IS_NEED_STR_LEN_OP_EXACT(op)) len += SIZE_LENGTH; len += (int )byte_len; return len; } static int add_compile_string(UChar* s, int mb_len, OnigDistance byte_len, regex_t* reg, int ignore_case) { int op = select_str_opcode(mb_len, byte_len, ignore_case); add_opcode(reg, op); if (op == OP_EXACTMBN) add_length(reg, mb_len); if (IS_NEED_STR_LEN_OP_EXACT(op)) { if (op == OP_EXACTN_IC) add_length(reg, byte_len); else add_length(reg, byte_len / mb_len); } add_bytes(reg, s, byte_len); return 0; } static int compile_length_string_node(Node* node, regex_t* reg) { int rlen, r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev; StrNode* sn; sn = NSTR(node); if (sn->end <= sn->s) return 0; ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; prev_len = enclen(enc, p, sn->end); p += prev_len; blen = prev_len; rlen = 0; for (; p < sn->end; ) { len = enclen(enc, p, sn->end); if (len == prev_len || ambig) { blen += len; } else { r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; prev = p; blen = len; prev_len = len; } p += len; } r = add_compile_string_length(prev, prev_len, blen, reg, ambig); rlen += r; return rlen; } static int compile_length_string_raw_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string_length(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0); } static int compile_string_node(Node* node, regex_t* reg) { int r, len, prev_len, blen, ambig; OnigEncoding enc = reg->enc; UChar *p, *prev, *end; StrNode* sn; sn = NSTR(node); if (sn->end <= sn->s) return 0; end = sn->end; ambig = NSTRING_IS_AMBIG(node); p = prev = sn->s; prev_len = enclen(enc, p, end); p += prev_len; blen = prev_len; for (; p < end; ) { len = enclen(enc, p, end); if (len == prev_len || ambig) { blen += len; } else { r = add_compile_string(prev, prev_len, blen, reg, ambig); if (r) return r; prev = p; blen = len; prev_len = len; } p += len; } return add_compile_string(prev, prev_len, blen, reg, ambig); } static int compile_string_raw_node(StrNode* sn, regex_t* reg) { if (sn->end <= sn->s) return 0; return add_compile_string(sn->s, 1 /* sb */, sn->end - sn->s, reg, 0); } static int add_multi_byte_cclass(BBuf* mbuf, regex_t* reg) { #ifdef PLATFORM_UNALIGNED_WORD_ACCESS add_length(reg, mbuf->used); return add_bytes(reg, mbuf->p, mbuf->used); #else int r, pad_size; UChar* p = BBUF_GET_ADD_ADDRESS(reg) + SIZE_LENGTH; GET_ALIGNMENT_PAD_SIZE(p, pad_size); add_length(reg, mbuf->used + (WORD_ALIGNMENT_SIZE - 1)); if (pad_size != 0) add_bytes(reg, PadBuf, pad_size); r = add_bytes(reg, mbuf->p, mbuf->used); /* padding for return value from compile_length_cclass_node() to be fix. */ pad_size = (WORD_ALIGNMENT_SIZE - 1) - pad_size; if (pad_size != 0) add_bytes(reg, PadBuf, pad_size); return r; #endif } static int compile_length_cclass_node(CClassNode* cc, regex_t* reg) { int len; if (IS_NULL(cc->mbuf)) { len = SIZE_OPCODE + SIZE_BITSET; } else { if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { len = SIZE_OPCODE; } else { len = SIZE_OPCODE + SIZE_BITSET; } #ifdef PLATFORM_UNALIGNED_WORD_ACCESS len += SIZE_LENGTH + cc->mbuf->used; #else len += SIZE_LENGTH + cc->mbuf->used + (WORD_ALIGNMENT_SIZE - 1); #endif } return len; } static int compile_cclass_node(CClassNode* cc, regex_t* reg) { int r; if (IS_NULL(cc->mbuf)) { if (IS_NCCLASS_NOT(cc)) add_opcode(reg, OP_CCLASS_NOT); else add_opcode(reg, OP_CCLASS); r = add_bitset(reg, cc->bs); } else { if (ONIGENC_MBC_MINLEN(reg->enc) > 1 || bitset_is_empty(cc->bs)) { if (IS_NCCLASS_NOT(cc)) add_opcode(reg, OP_CCLASS_MB_NOT); else add_opcode(reg, OP_CCLASS_MB); r = add_multi_byte_cclass(cc->mbuf, reg); } else { if (IS_NCCLASS_NOT(cc)) add_opcode(reg, OP_CCLASS_MIX_NOT); else add_opcode(reg, OP_CCLASS_MIX); r = add_bitset(reg, cc->bs); if (r) return r; r = add_multi_byte_cclass(cc->mbuf, reg); } } return r; } static int entry_repeat_range(regex_t* reg, int id, int lower, int upper) { #define REPEAT_RANGE_ALLOC 4 OnigRepeatRange* p; if (reg->repeat_range_alloc == 0) { p = (OnigRepeatRange* )xmalloc(sizeof(OnigRepeatRange) * REPEAT_RANGE_ALLOC); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = REPEAT_RANGE_ALLOC; } else if (reg->repeat_range_alloc <= id) { int n; n = reg->repeat_range_alloc + REPEAT_RANGE_ALLOC; p = (OnigRepeatRange* )xrealloc(reg->repeat_range, sizeof(OnigRepeatRange) * n); CHECK_NULL_RETURN_MEMERR(p); reg->repeat_range = p; reg->repeat_range_alloc = n; } else { p = reg->repeat_range; } p[id].lower = lower; p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); return 0; } static int compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info, regex_t* reg) { int r; int num_repeat = reg->num_repeat; r = add_opcode(reg, qn->greedy ? OP_REPEAT : OP_REPEAT_NG); if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ reg->num_repeat++; if (r) return r; r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC); if (r) return r; r = entry_repeat_range(reg, num_repeat, qn->lower, qn->upper); if (r) return r; r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; if ( #ifdef USE_SUBEXP_CALL reg->num_call > 0 || #endif IS_QUANTIFIER_IN_REPEAT(qn)) { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC_SG : OP_REPEAT_INC_NG_SG); } else { r = add_opcode(reg, qn->greedy ? OP_REPEAT_INC : OP_REPEAT_INC_NG); } if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ return r; } static int is_anychar_star_quantifier(QtfrNode* qn) { if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && NTYPE(qn->target) == NT_CANY) return 1; else return 0; } #define QUANTIFIER_EXPAND_LIMIT_SIZE 50 #define CKN_ON (ckn > 0) #ifdef USE_COMBINATION_EXPLOSION_CHECK static int compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) { int len, mod_tlen, cklen; int ckn; int infinite = IS_REPEAT_INFINITE(qn->upper); int empty_info = qn->target_empty_info; int tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0); /* anychar repeat */ if (NTYPE(qn->target) == NT_CANY) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; else return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; } } if (empty_info != 0) mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); else mod_tlen = tlen; if (infinite && qn->lower <= 1) { if (qn->greedy) { if (qn->lower == 1) len = SIZE_OP_JUMP; else len = 0; len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP; } else { if (qn->lower == 0) len = SIZE_OP_JUMP; else len = 0; len += mod_tlen + SIZE_OP_PUSH + cklen; } } else if (qn->upper == 0) { if (qn->is_refered != 0) /* /(?..){0}/ */ len = SIZE_OP_JUMP + tlen; else len = 0; } else if (qn->upper == 1 && qn->greedy) { if (qn->lower == 0) { if (CKN_ON) { len = SIZE_OP_STATE_CHECK_PUSH + tlen; } else { len = SIZE_OP_PUSH + tlen; } } else { len = tlen; } } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen; } else { len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; if (CKN_ON) len += SIZE_OP_STATE_CHECK; } return len; } static int compile_quantifier_node(QtfrNode* qn, regex_t* reg) { int r, mod_tlen; int ckn; int infinite = IS_REPEAT_INFINITE(qn->upper); int empty_info = qn->target_empty_info; int tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); if (is_anychar_star_quantifier(qn)) { r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; if (IS_NOT_NULL(qn->next_head_exact) && !CKN_ON) { if (IS_MULTILINE(reg->options)) r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); else r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); if (r) return r; if (CKN_ON) { r = add_state_check_num(reg, ckn); if (r) return r; } return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); } else { if (IS_MULTILINE(reg->options)) { r = add_opcode(reg, (CKN_ON ? OP_STATE_CHECK_ANYCHAR_ML_STAR : OP_ANYCHAR_ML_STAR)); } else { r = add_opcode(reg, (CKN_ON ? OP_STATE_CHECK_ANYCHAR_STAR : OP_ANYCHAR_STAR)); } if (r) return r; if (CKN_ON) r = add_state_check_num(reg, ckn); return r; } } if (empty_info != 0) mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); else mod_tlen = tlen; if (infinite && qn->lower <= 1) { if (qn->greedy) { if (qn->lower == 1) { r = add_opcode_rel_addr(reg, OP_JUMP, (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)); if (r) return r; } if (CKN_ON) { r = add_opcode(reg, OP_STATE_CHECK_PUSH); if (r) return r; r = add_state_check_num(reg, ckn); if (r) return r; r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP); } else { r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); } if (r) return r; r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH))); } else { if (qn->lower == 0) { r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); if (r) return r; } r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; if (CKN_ON) { r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP); if (r) return r; r = add_state_check_num(reg, ckn); if (r) return r; r = add_rel_addr(reg, -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP)); } else r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); } } else if (qn->upper == 0) { if (qn->is_refered != 0) { /* /(?..){0}/ */ r = add_opcode_rel_addr(reg, OP_JUMP, tlen); if (r) return r; r = compile_tree(qn->target, reg); } else r = 0; } else if (qn->upper == 1 && qn->greedy) { if (qn->lower == 0) { if (CKN_ON) { r = add_opcode(reg, OP_STATE_CHECK_PUSH); if (r) return r; r = add_state_check_num(reg, ckn); if (r) return r; r = add_rel_addr(reg, tlen); } else { r = add_opcode_rel_addr(reg, OP_PUSH, tlen); } if (r) return r; } r = compile_tree(qn->target, reg); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ if (CKN_ON) { r = add_opcode(reg, OP_STATE_CHECK_PUSH); if (r) return r; r = add_state_check_num(reg, ckn); if (r) return r; r = add_rel_addr(reg, SIZE_OP_JUMP); } else { r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); } if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, tlen); if (r) return r; r = compile_tree(qn->target, reg); } else { r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); if (CKN_ON) { if (r) return r; r = add_opcode(reg, OP_STATE_CHECK); if (r) return r; r = add_state_check_num(reg, ckn); } } return r; } #else /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_quantifier_node(QtfrNode* qn, regex_t* reg) { int len, mod_tlen; int infinite = IS_REPEAT_INFINITE(qn->upper); int empty_info = qn->target_empty_info; int tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; /* anychar repeat */ if (NTYPE(qn->target) == NT_CANY) { if (qn->greedy && infinite) { if (IS_NOT_NULL(qn->next_head_exact)) return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower; else return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower; } } if (empty_info != 0) mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); else mod_tlen = tlen; if (infinite && (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { len = SIZE_OP_JUMP; } else { len = tlen * qn->lower; } if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) len += SIZE_OP_PUSH_OR_JUMP_EXACT1 + mod_tlen + SIZE_OP_JUMP; else #endif if (IS_NOT_NULL(qn->next_head_exact)) len += SIZE_OP_PUSH_IF_PEEK_NEXT + mod_tlen + SIZE_OP_JUMP; else len += SIZE_OP_PUSH + mod_tlen + SIZE_OP_JUMP; } else len += SIZE_OP_JUMP + mod_tlen + SIZE_OP_PUSH; } else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?..){0}/ */ len = SIZE_OP_JUMP + tlen; } else if (!infinite && qn->greedy && (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { len = tlen * qn->lower; len += (SIZE_OP_PUSH + tlen) * (qn->upper - qn->lower); } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ len = SIZE_OP_PUSH + SIZE_OP_JUMP + tlen; } else { len = SIZE_OP_REPEAT_INC + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; } return len; } static int compile_quantifier_node(QtfrNode* qn, regex_t* reg) { int i, r, mod_tlen; int infinite = IS_REPEAT_INFINITE(qn->upper); int empty_info = qn->target_empty_info; int tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; if (is_anychar_star_quantifier(qn)) { r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; if (IS_NOT_NULL(qn->next_head_exact)) { if (IS_MULTILINE(reg->options)) r = add_opcode(reg, OP_ANYCHAR_ML_STAR_PEEK_NEXT); else r = add_opcode(reg, OP_ANYCHAR_STAR_PEEK_NEXT); if (r) return r; return add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); } else { if (IS_MULTILINE(reg->options)) return add_opcode(reg, OP_ANYCHAR_ML_STAR); else return add_opcode(reg, OP_ANYCHAR_STAR); } } if (empty_info != 0) mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); else mod_tlen = tlen; if (infinite && (qn->lower <= 1 || tlen * qn->lower <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { if (qn->lower == 1 && tlen > QUANTIFIER_EXPAND_LIMIT_SIZE) { if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_OR_JUMP_EXACT1); else #endif if (IS_NOT_NULL(qn->next_head_exact)) r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH_IF_PEEK_NEXT); else r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_PUSH); } else { r = add_opcode_rel_addr(reg, OP_JUMP, SIZE_OP_JUMP); } if (r) return r; } else { r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; } if (qn->greedy) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) { r = add_opcode_rel_addr(reg, OP_PUSH_OR_JUMP_EXACT1, mod_tlen + SIZE_OP_JUMP); if (r) return r; add_bytes(reg, NSTR(qn->head_exact)->s, 1); r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_OR_JUMP_EXACT1)); } else #endif if (IS_NOT_NULL(qn->next_head_exact)) { r = add_opcode_rel_addr(reg, OP_PUSH_IF_PEEK_NEXT, mod_tlen + SIZE_OP_JUMP); if (r) return r; add_bytes(reg, NSTR(qn->next_head_exact)->s, 1); r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH_IF_PEEK_NEXT)); } else { r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); if (r) return r; r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, -(mod_tlen + (int )SIZE_OP_JUMP + (int )SIZE_OP_PUSH)); } } else { r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); if (r) return r; r = compile_tree_empty_check(qn->target, reg, empty_info); if (r) return r; r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); } } else if (qn->upper == 0 && qn->is_refered != 0) { /* /(?..){0}/ */ r = add_opcode_rel_addr(reg, OP_JUMP, tlen); if (r) return r; r = compile_tree(qn->target, reg); } else if (!infinite && qn->greedy && (qn->upper == 1 || (tlen + SIZE_OP_PUSH) * qn->upper <= QUANTIFIER_EXPAND_LIMIT_SIZE)) { int n = qn->upper - qn->lower; r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; for (i = 0; i < n; i++) { r = add_opcode_rel_addr(reg, OP_PUSH, (n - i) * tlen + (n - i - 1) * SIZE_OP_PUSH); if (r) return r; r = compile_tree(qn->target, reg); if (r) return r; } } else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, tlen); if (r) return r; r = compile_tree(qn->target, reg); } else { r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); } return r; } #endif /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_option_node(EncloseNode* node, regex_t* reg) { int tlen; OnigOptionType prev = reg->options; reg->options = node->option; tlen = compile_length_tree(node->target, reg); reg->options = prev; if (tlen < 0) return tlen; if (IS_DYNAMIC_OPTION(prev ^ node->option)) { return SIZE_OP_SET_OPTION_PUSH + SIZE_OP_SET_OPTION + SIZE_OP_FAIL + tlen + SIZE_OP_SET_OPTION; } else return tlen; } static int compile_option_node(EncloseNode* node, regex_t* reg) { int r; OnigOptionType prev = reg->options; if (IS_DYNAMIC_OPTION(prev ^ node->option)) { r = add_opcode_option(reg, OP_SET_OPTION_PUSH, node->option); if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); if (r) return r; r = add_opcode(reg, OP_FAIL); if (r) return r; } reg->options = node->option; r = compile_tree(node->target, reg); reg->options = prev; if (IS_DYNAMIC_OPTION(prev ^ node->option)) { if (r) return r; r = add_opcode_option(reg, OP_SET_OPTION, prev); } return r; } static int compile_length_enclose_node(EncloseNode* node, regex_t* reg) { int len; int tlen; if (node->type == ENCLOSE_OPTION) return compile_length_option_node(node, reg); if (node->target) { tlen = compile_length_tree(node->target, reg); if (tlen < 0) return tlen; } else tlen = 0; switch (node->type) { case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL if (IS_ENCLOSE_CALLED(node)) { len = SIZE_OP_MEMORY_START_PUSH + tlen + SIZE_OP_CALL + SIZE_OP_JUMP + SIZE_OP_RETURN; if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) len += (IS_ENCLOSE_RECURSION(node) ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); else len += (IS_ENCLOSE_RECURSION(node) ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); } else if (IS_ENCLOSE_RECURSION(node)) { len = SIZE_OP_MEMORY_START_PUSH; len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_REC); } else #endif { if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum)) len = SIZE_OP_MEMORY_START_PUSH; else len = SIZE_OP_MEMORY_START; len += tlen + (BIT_STATUS_AT(reg->bt_mem_end, node->regnum) ? SIZE_OP_MEMORY_END_PUSH : SIZE_OP_MEMORY_END); } break; case ENCLOSE_STOP_BACKTRACK: if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) { QtfrNode* qn = NQTFR(node->target); tlen = compile_length_tree(qn->target, reg); if (tlen < 0) return tlen; len = tlen * qn->lower + SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP; } else { len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT; } break; case ENCLOSE_CONDITION: len = SIZE_OP_CONDITION; if (NTYPE(node->target) == NT_ALT) { Node* x = node->target; tlen = compile_length_tree(NCAR(x), reg); /* yes-node */ if (tlen < 0) return tlen; len += tlen + SIZE_OP_JUMP; if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; x = NCDR(x); tlen = compile_length_tree(NCAR(x), reg); /* no-node */ if (tlen < 0) return tlen; len += tlen; if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; } else { return ONIGERR_PARSER_BUG; } break; default: return ONIGERR_TYPE_BUG; break; } return len; } static int get_char_length_tree(Node* node, regex_t* reg, int* len); static int compile_enclose_node(EncloseNode* node, regex_t* reg) { int r, len; if (node->type == ENCLOSE_OPTION) return compile_option_node(node, reg); switch (node->type) { case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL if (IS_ENCLOSE_CALLED(node)) { r = add_opcode(reg, OP_CALL); if (r) return r; node->call_addr = BBUF_GET_OFFSET_POS(reg) + SIZE_ABSADDR + SIZE_OP_JUMP; node->state |= NST_ADDR_FIXED; r = add_abs_addr(reg, (int )node->call_addr); if (r) return r; len = compile_length_tree(node->target, reg); len += (SIZE_OP_MEMORY_START_PUSH + SIZE_OP_RETURN); if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) len += (IS_ENCLOSE_RECURSION(node) ? SIZE_OP_MEMORY_END_PUSH_REC : SIZE_OP_MEMORY_END_PUSH); else len += (IS_ENCLOSE_RECURSION(node) ? SIZE_OP_MEMORY_END_REC : SIZE_OP_MEMORY_END); r = add_opcode_rel_addr(reg, OP_JUMP, len); if (r) return r; } #endif if (BIT_STATUS_AT(reg->bt_mem_start, node->regnum)) r = add_opcode(reg, OP_MEMORY_START_PUSH); else r = add_opcode(reg, OP_MEMORY_START); if (r) return r; r = add_mem_num(reg, node->regnum); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; #ifdef USE_SUBEXP_CALL if (IS_ENCLOSE_CALLED(node)) { if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node) ? OP_MEMORY_END_PUSH_REC : OP_MEMORY_END_PUSH)); else r = add_opcode(reg, (IS_ENCLOSE_RECURSION(node) ? OP_MEMORY_END_REC : OP_MEMORY_END)); if (r) return r; r = add_mem_num(reg, node->regnum); if (r) return r; r = add_opcode(reg, OP_RETURN); } else if (IS_ENCLOSE_RECURSION(node)) { if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) r = add_opcode(reg, OP_MEMORY_END_PUSH_REC); else r = add_opcode(reg, OP_MEMORY_END_REC); if (r) return r; r = add_mem_num(reg, node->regnum); } else #endif { if (BIT_STATUS_AT(reg->bt_mem_end, node->regnum)) r = add_opcode(reg, OP_MEMORY_END_PUSH); else r = add_opcode(reg, OP_MEMORY_END); if (r) return r; r = add_mem_num(reg, node->regnum); } break; case ENCLOSE_STOP_BACKTRACK: if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) { QtfrNode* qn = NQTFR(node->target); r = compile_tree_n_times(qn->target, qn->lower, reg); if (r) return r; len = compile_length_tree(qn->target, reg); if (len < 0) return len; r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_POP + SIZE_OP_JUMP); if (r) return r; r = compile_tree(qn->target, reg); if (r) return r; r = add_opcode(reg, OP_POP); if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, -((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP)); } else { r = add_opcode(reg, OP_PUSH_STOP_BT); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; r = add_opcode(reg, OP_POP_STOP_BT); } break; case ENCLOSE_CONDITION: r = add_opcode(reg, OP_CONDITION); if (r) return r; r = add_mem_num(reg, node->regnum); if (r) return r; if (NTYPE(node->target) == NT_ALT) { Node* x = node->target; int len2; len = compile_length_tree(NCAR(x), reg); /* yes-node */ if (len < 0) return len; if (NCDR(x) == NULL) return ONIGERR_PARSER_BUG; x = NCDR(x); len2 = compile_length_tree(NCAR(x), reg); /* no-node */ if (len2 < 0) return len2; if (NCDR(x) != NULL) return ONIGERR_INVALID_CONDITION_PATTERN; x = node->target; r = add_rel_addr(reg, len + SIZE_OP_JUMP); if (r) return r; r = compile_tree(NCAR(x), reg); /* yes-node */ if (r) return r; r = add_opcode_rel_addr(reg, OP_JUMP, len2); if (r) return r; x = NCDR(x); r = compile_tree(NCAR(x), reg); /* no-node */ } else { return ONIGERR_PARSER_BUG; } break; default: return ONIGERR_TYPE_BUG; break; } return r; } static int compile_length_anchor_node(AnchorNode* node, regex_t* reg) { int len; int tlen = 0; if (node->target) { tlen = compile_length_tree(node->target, reg); if (tlen < 0) return tlen; } switch (node->type) { case ANCHOR_PREC_READ: len = SIZE_OP_PUSH_POS + tlen + SIZE_OP_POP_POS; break; case ANCHOR_PREC_READ_NOT: len = SIZE_OP_PUSH_POS_NOT + tlen + SIZE_OP_FAIL_POS; break; case ANCHOR_LOOK_BEHIND: len = SIZE_OP_LOOK_BEHIND + tlen; break; case ANCHOR_LOOK_BEHIND_NOT: len = SIZE_OP_PUSH_LOOK_BEHIND_NOT + tlen + SIZE_OP_FAIL_LOOK_BEHIND_NOT; break; default: len = SIZE_OPCODE; break; } return len; } static int compile_anchor_node(AnchorNode* node, regex_t* reg) { int r, len; switch (node->type) { case ANCHOR_BEGIN_BUF: r = add_opcode(reg, OP_BEGIN_BUF); break; case ANCHOR_END_BUF: r = add_opcode(reg, OP_END_BUF); break; case ANCHOR_BEGIN_LINE: r = add_opcode(reg, OP_BEGIN_LINE); break; case ANCHOR_END_LINE: r = add_opcode(reg, OP_END_LINE); break; case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break; case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break; /* used for implicit anchor optimization: /.*a/ ==> /(?:^|\G).*a/ */ case ANCHOR_ANYCHAR_STAR: r = add_opcode(reg, OP_BEGIN_POS_OR_LINE); break; case ANCHOR_WORD_BOUND: if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND); else r = add_opcode(reg, OP_WORD_BOUND); break; case ANCHOR_NOT_WORD_BOUND: if (node->ascii_range) r = add_opcode(reg, OP_NOT_ASCII_WORD_BOUND); else r = add_opcode(reg, OP_NOT_WORD_BOUND); break; #ifdef USE_WORD_BEGIN_END case ANCHOR_WORD_BEGIN: if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BEGIN); else r = add_opcode(reg, OP_WORD_BEGIN); break; case ANCHOR_WORD_END: if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_END); else r = add_opcode(reg, OP_WORD_END); break; #endif case ANCHOR_KEEP: r = add_opcode(reg, OP_KEEP); break; case ANCHOR_PREC_READ: r = add_opcode(reg, OP_PUSH_POS); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; r = add_opcode(reg, OP_POP_POS); break; case ANCHOR_PREC_READ_NOT: len = compile_length_tree(node->target, reg); if (len < 0) return len; r = add_opcode_rel_addr(reg, OP_PUSH_POS_NOT, len + SIZE_OP_FAIL_POS); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; r = add_opcode(reg, OP_FAIL_POS); break; case ANCHOR_LOOK_BEHIND: { int n; r = add_opcode(reg, OP_LOOK_BEHIND); if (r) return r; if (node->char_len < 0) { r = get_char_length_tree(node->target, reg, &n); if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else n = node->char_len; r = add_length(reg, n); if (r) return r; r = compile_tree(node->target, reg); } break; case ANCHOR_LOOK_BEHIND_NOT: { int n; len = compile_length_tree(node->target, reg); r = add_opcode_rel_addr(reg, OP_PUSH_LOOK_BEHIND_NOT, len + SIZE_OP_FAIL_LOOK_BEHIND_NOT); if (r) return r; if (node->char_len < 0) { r = get_char_length_tree(node->target, reg, &n); if (r) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } else n = node->char_len; r = add_length(reg, n); if (r) return r; r = compile_tree(node->target, reg); if (r) return r; r = add_opcode(reg, OP_FAIL_LOOK_BEHIND_NOT); } break; default: return ONIGERR_TYPE_BUG; break; } return r; } static int compile_length_tree(Node* node, regex_t* reg) { int len, type, r; type = NTYPE(node); switch (type) { case NT_LIST: len = 0; do { r = compile_length_tree(NCAR(node), reg); if (r < 0) return r; len += r; } while (IS_NOT_NULL(node = NCDR(node))); r = len; break; case NT_ALT: { int n = 0; len = 0; do { r = compile_length_tree(NCAR(node), reg); if (r < 0) return r; len += r; n++; } while (IS_NOT_NULL(node = NCDR(node))); r = len; r += (SIZE_OP_PUSH + SIZE_OP_JUMP) * (n - 1); } break; case NT_STR: if (NSTRING_IS_RAW(node)) r = compile_length_string_raw_node(NSTR(node), reg); else r = compile_length_string_node(node, reg); break; case NT_CCLASS: r = compile_length_cclass_node(NCCLASS(node), reg); break; case NT_CTYPE: case NT_CANY: r = SIZE_OPCODE; break; case NT_BREF: { BRefNode* br = NBREF(node); #ifdef USE_BACKREF_WITH_LEVEL if (IS_BACKREF_NEST_LEVEL(br)) { r = SIZE_OPCODE + SIZE_OPTION + SIZE_LENGTH + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); } else #endif if (br->back_num == 1) { r = ((!IS_IGNORECASE(reg->options) && br->back_static[0] <= 2) ? SIZE_OPCODE : (SIZE_OPCODE + SIZE_MEMNUM)); } else { r = SIZE_OPCODE + SIZE_LENGTH + (SIZE_MEMNUM * br->back_num); } } break; #ifdef USE_SUBEXP_CALL case NT_CALL: r = SIZE_OP_CALL; break; #endif case NT_QTFR: r = compile_length_quantifier_node(NQTFR(node), reg); break; case NT_ENCLOSE: r = compile_length_enclose_node(NENCLOSE(node), reg); break; case NT_ANCHOR: r = compile_length_anchor_node(NANCHOR(node), reg); break; default: return ONIGERR_TYPE_BUG; break; } return r; } static int compile_tree(Node* node, regex_t* reg) { int n, type, len, pos, r = 0; type = NTYPE(node); switch (type) { case NT_LIST: do { r = compile_tree(NCAR(node), reg); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_ALT: { Node* x = node; len = 0; do { len += compile_length_tree(NCAR(x), reg); if (NCDR(x) != NULL) { len += SIZE_OP_PUSH + SIZE_OP_JUMP; } } while (IS_NOT_NULL(x = NCDR(x))); pos = reg->used + len; /* goal position */ do { len = compile_length_tree(NCAR(node), reg); if (IS_NOT_NULL(NCDR(node))) { r = add_opcode_rel_addr(reg, OP_PUSH, len + SIZE_OP_JUMP); if (r) break; } r = compile_tree(NCAR(node), reg); if (r) break; if (IS_NOT_NULL(NCDR(node))) { len = pos - (reg->used + SIZE_OP_JUMP); r = add_opcode_rel_addr(reg, OP_JUMP, len); if (r) break; } } while (IS_NOT_NULL(node = NCDR(node))); } break; case NT_STR: if (NSTRING_IS_RAW(node)) r = compile_string_raw_node(NSTR(node), reg); else r = compile_string_node(node, reg); break; case NT_CCLASS: r = compile_cclass_node(NCCLASS(node), reg); break; case NT_CTYPE: { int op; switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(node)->ascii_range != 0) { if (NCTYPE(node)->not != 0) op = OP_NOT_ASCII_WORD; else op = OP_ASCII_WORD; } else { if (NCTYPE(node)->not != 0) op = OP_NOT_WORD; else op = OP_WORD; } break; default: return ONIGERR_TYPE_BUG; break; } r = add_opcode(reg, op); } break; case NT_CANY: if (IS_MULTILINE(reg->options)) r = add_opcode(reg, OP_ANYCHAR_ML); else r = add_opcode(reg, OP_ANYCHAR); break; case NT_BREF: { BRefNode* br = NBREF(node); #ifdef USE_BACKREF_WITH_LEVEL if (IS_BACKREF_NEST_LEVEL(br)) { r = add_opcode(reg, OP_BACKREF_WITH_LEVEL); if (r) return r; r = add_option(reg, (reg->options & ONIG_OPTION_IGNORECASE)); if (r) return r; r = add_length(reg, br->nest_level); if (r) return r; goto add_bacref_mems; } else #endif if (br->back_num == 1) { n = br->back_static[0]; if (IS_IGNORECASE(reg->options)) { r = add_opcode(reg, OP_BACKREFN_IC); if (r) return r; r = add_mem_num(reg, n); } else { switch (n) { case 1: r = add_opcode(reg, OP_BACKREF1); break; case 2: r = add_opcode(reg, OP_BACKREF2); break; default: r = add_opcode(reg, OP_BACKREFN); if (r) return r; r = add_mem_num(reg, n); break; } } } else { int i; int* p; if (IS_IGNORECASE(reg->options)) { r = add_opcode(reg, OP_BACKREF_MULTI_IC); } else { r = add_opcode(reg, OP_BACKREF_MULTI); } if (r) return r; #ifdef USE_BACKREF_WITH_LEVEL add_bacref_mems: #endif r = add_length(reg, br->back_num); if (r) return r; p = BACKREFS_P(br); for (i = br->back_num - 1; i >= 0; i--) { r = add_mem_num(reg, p[i]); if (r) return r; } } } break; #ifdef USE_SUBEXP_CALL case NT_CALL: r = compile_call(NCALL(node), reg); break; #endif case NT_QTFR: r = compile_quantifier_node(NQTFR(node), reg); break; case NT_ENCLOSE: r = compile_enclose_node(NENCLOSE(node), reg); break; case NT_ANCHOR: r = compile_anchor_node(NANCHOR(node), reg); break; default: #ifdef ONIG_DEBUG fprintf(stderr, "compile_tree: undefined node type %d\n", NTYPE(node)); #endif break; } return r; } #ifdef USE_NAMED_GROUP static int noname_disable_map(Node** plink, GroupNumRemap* map, int* counter) { int r = 0; Node* node = *plink; switch (NTYPE(node)) { case NT_LIST: case NT_ALT: do { r = noname_disable_map(&(NCAR(node)), map, counter); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: { Node** ptarget = &(NQTFR(node)->target); Node* old = *ptarget; r = noname_disable_map(ptarget, map, counter); if (*ptarget != old && NTYPE(*ptarget) == NT_QTFR) { onig_reduce_nested_quantifier(node, *ptarget); } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); if (en->type == ENCLOSE_MEMORY) { if (IS_ENCLOSE_NAMED_GROUP(en)) { (*counter)++; map[en->regnum].new_val = *counter; en->regnum = *counter; } else if (en->regnum != 0) { *plink = en->target; en->target = NULL_NODE; onig_node_free(node); r = noname_disable_map(plink, map, counter); break; } } r = noname_disable_map(&(en->target), map, counter); } break; case NT_ANCHOR: if (NANCHOR(node)->target) r = noname_disable_map(&(NANCHOR(node)->target), map, counter); break; default: break; } return r; } static int renumber_node_backref(Node* node, GroupNumRemap* map) { int i, pos, n, old_num; int *backs; BRefNode* bn = NBREF(node); if (! IS_BACKREF_NAME_REF(bn)) return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; old_num = bn->back_num; if (IS_NULL(bn->back_dynamic)) backs = bn->back_static; else backs = bn->back_dynamic; for (i = 0, pos = 0; i < old_num; i++) { n = map[backs[i]].new_val; if (n > 0) { backs[pos] = n; pos++; } } bn->back_num = pos; return 0; } static int renumber_by_map(Node* node, GroupNumRemap* map) { int r = 0; switch (NTYPE(node)) { case NT_LIST: case NT_ALT: do { r = renumber_by_map(NCAR(node), map); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = renumber_by_map(NQTFR(node)->target, map); break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); if (en->type == ENCLOSE_CONDITION) en->regnum = map[en->regnum].new_val; r = renumber_by_map(en->target, map); } break; case NT_BREF: r = renumber_node_backref(node, map); break; case NT_ANCHOR: if (NANCHOR(node)->target) r = renumber_by_map(NANCHOR(node)->target, map); break; default: break; } return r; } static int numbered_ref_check(Node* node) { int r = 0; switch (NTYPE(node)) { case NT_LIST: case NT_ALT: do { r = numbered_ref_check(NCAR(node)); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = numbered_ref_check(NQTFR(node)->target); break; case NT_ENCLOSE: r = numbered_ref_check(NENCLOSE(node)->target); break; case NT_BREF: if (! IS_BACKREF_NAME_REF(NBREF(node))) return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; break; case NT_ANCHOR: if (NANCHOR(node)->target) r = numbered_ref_check(NANCHOR(node)->target); break; default: break; } return r; } static int disable_noname_group_capture(Node** root, regex_t* reg, ScanEnv* env) { int r, i, pos, counter; BitStatusType loc; GroupNumRemap* map; map = (GroupNumRemap* )xalloca(sizeof(GroupNumRemap) * (env->num_mem + 1)); CHECK_NULL_RETURN_MEMERR(map); for (i = 1; i <= env->num_mem; i++) { map[i].new_val = 0; } counter = 0; r = noname_disable_map(root, map, &counter); if (r != 0) return r; r = renumber_by_map(*root, map); if (r != 0) return r; for (i = 1, pos = 1; i <= env->num_mem; i++) { if (map[i].new_val > 0) { SCANENV_MEM_NODES(env)[pos] = SCANENV_MEM_NODES(env)[i]; pos++; } } loc = env->capture_history; BIT_STATUS_CLEAR(env->capture_history); for (i = 1; i <= ONIG_MAX_CAPTURE_HISTORY_GROUP; i++) { if (BIT_STATUS_AT(loc, i)) { BIT_STATUS_ON_AT_SIMPLE(env->capture_history, map[i].new_val); } } env->num_mem = env->num_named; reg->num_mem = env->num_named; return onig_renumber_name_table(reg, map); } #endif /* USE_NAMED_GROUP */ #ifdef USE_SUBEXP_CALL static int unset_addr_list_fix(UnsetAddrList* uslist, regex_t* reg) { int i, offset; EncloseNode* en; AbsAddrType addr; for (i = 0; i < uslist->num; i++) { en = NENCLOSE(uslist->us[i].target); if (! IS_ENCLOSE_ADDR_FIXED(en)) return ONIGERR_PARSER_BUG; addr = en->call_addr; offset = uslist->us[i].offset; BBUF_WRITE(reg, offset, &addr, SIZE_ABSADDR); } return 0; } #endif #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT static int quantifiers_memory_node_info(Node* node) { int r = 0; switch (NTYPE(node)) { case NT_LIST: case NT_ALT: { int v; do { v = quantifiers_memory_node_info(NCAR(node)); if (v > r) r = v; } while (v >= 0 && IS_NOT_NULL(node = NCDR(node))); } break; # ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) { return NQ_TARGET_IS_EMPTY_REC; /* tiny version */ } else r = quantifiers_memory_node_info(NCALL(node)->target); break; # endif case NT_QTFR: { QtfrNode* qn = NQTFR(node); if (qn->upper != 0) { r = quantifiers_memory_node_info(qn->target); } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: return NQ_TARGET_IS_EMPTY_MEM; break; case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: r = quantifiers_memory_node_info(en->target); break; default: break; } } break; case NT_BREF: case NT_STR: case NT_CTYPE: case NT_CCLASS: case NT_CANY: case NT_ANCHOR: default: break; } return r; } #endif /* USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT */ static int get_min_match_length(Node* node, OnigDistance *min, ScanEnv* env) { OnigDistance tmin; int r = 0; *min = 0; switch (NTYPE(node)) { case NT_BREF: { int i; int* backs; Node** nodes = SCANENV_MEM_NODES(env); BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) break; backs = BACKREFS_P(br); if (backs[0] > env->num_mem) return ONIGERR_INVALID_BACKREF; r = get_min_match_length(nodes[backs[0]], min, env); if (r != 0) break; for (i = 1; i < br->back_num; i++) { if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; r = get_min_match_length(nodes[backs[i]], &tmin, env); if (r != 0) break; if (*min > tmin) *min = tmin; } } break; #ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) { EncloseNode* en = NENCLOSE(NCALL(node)->target); if (IS_ENCLOSE_MIN_FIXED(en)) *min = en->min_len; } else r = get_min_match_length(NCALL(node)->target, min, env); break; #endif case NT_LIST: do { r = get_min_match_length(NCAR(node), &tmin, env); if (r == 0) *min += tmin; } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_ALT: { Node *x, *y; y = node; do { x = NCAR(y); r = get_min_match_length(x, &tmin, env); if (r != 0) break; if (y == node) *min = tmin; else if (*min > tmin) *min = tmin; } while (r == 0 && IS_NOT_NULL(y = NCDR(y))); } break; case NT_STR: { StrNode* sn = NSTR(node); *min = sn->end - sn->s; } break; case NT_CTYPE: *min = 1; break; case NT_CCLASS: case NT_CANY: *min = 1; break; case NT_QTFR: { QtfrNode* qn = NQTFR(node); if (qn->lower > 0) { r = get_min_match_length(qn->target, min, env); if (r == 0) *min = distance_multiply(*min, qn->lower); } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: if (IS_ENCLOSE_MIN_FIXED(en)) *min = en->min_len; else { if (IS_ENCLOSE_MARK1(NENCLOSE(node))) *min = 0; /* recursive */ else { SET_ENCLOSE_STATUS(node, NST_MARK1); r = get_min_match_length(en->target, min, env); CLEAR_ENCLOSE_STATUS(node, NST_MARK1); if (r == 0) { en->min_len = *min; SET_ENCLOSE_STATUS(node, NST_MIN_FIXED); } } } break; case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: r = get_min_match_length(en->target, min, env); break; } } break; case NT_ANCHOR: default: break; } return r; } static int get_max_match_length(Node* node, OnigDistance *max, ScanEnv* env) { OnigDistance tmax; int r = 0; *max = 0; switch (NTYPE(node)) { case NT_LIST: do { r = get_max_match_length(NCAR(node), &tmax, env); if (r == 0) *max = distance_add(*max, tmax); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_ALT: do { r = get_max_match_length(NCAR(node), &tmax, env); if (r == 0 && *max < tmax) *max = tmax; } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_STR: { StrNode* sn = NSTR(node); *max = sn->end - sn->s; } break; case NT_CTYPE: *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); break; case NT_CCLASS: case NT_CANY: *max = ONIGENC_MBC_MAXLEN_DIST(env->enc); break; case NT_BREF: { int i; int* backs; Node** nodes = SCANENV_MEM_NODES(env); BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) { *max = ONIG_INFINITE_DISTANCE; break; } backs = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { if (backs[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; r = get_max_match_length(nodes[backs[i]], &tmax, env); if (r != 0) break; if (*max < tmax) *max = tmax; } } break; #ifdef USE_SUBEXP_CALL case NT_CALL: if (! IS_CALL_RECURSION(NCALL(node))) r = get_max_match_length(NCALL(node)->target, max, env); else *max = ONIG_INFINITE_DISTANCE; break; #endif case NT_QTFR: { QtfrNode* qn = NQTFR(node); if (qn->upper != 0) { r = get_max_match_length(qn->target, max, env); if (r == 0 && *max != 0) { if (! IS_REPEAT_INFINITE(qn->upper)) *max = distance_multiply(*max, qn->upper); else *max = ONIG_INFINITE_DISTANCE; } } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: if (IS_ENCLOSE_MAX_FIXED(en)) *max = en->max_len; else { if (IS_ENCLOSE_MARK1(NENCLOSE(node))) *max = ONIG_INFINITE_DISTANCE; else { SET_ENCLOSE_STATUS(node, NST_MARK1); r = get_max_match_length(en->target, max, env); CLEAR_ENCLOSE_STATUS(node, NST_MARK1); if (r == 0) { en->max_len = *max; SET_ENCLOSE_STATUS(node, NST_MAX_FIXED); } } } break; case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: r = get_max_match_length(en->target, max, env); break; } } break; case NT_ANCHOR: default: break; } return r; } #define GET_CHAR_LEN_VARLEN -1 #define GET_CHAR_LEN_TOP_ALT_VARLEN -2 /* fixed size pattern node only */ static int get_char_length_tree1(Node* node, regex_t* reg, int* len, int level) { int tlen; int r = 0; level++; *len = 0; switch (NTYPE(node)) { case NT_LIST: do { r = get_char_length_tree1(NCAR(node), reg, &tlen, level); if (r == 0) *len = (int )distance_add(*len, tlen); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_ALT: { int tlen2; int varlen = 0; r = get_char_length_tree1(NCAR(node), reg, &tlen, level); while (r == 0 && IS_NOT_NULL(node = NCDR(node))) { r = get_char_length_tree1(NCAR(node), reg, &tlen2, level); if (r == 0) { if (tlen != tlen2) varlen = 1; } } if (r == 0) { if (varlen != 0) { if (level == 1) r = GET_CHAR_LEN_TOP_ALT_VARLEN; else r = GET_CHAR_LEN_VARLEN; } else *len = tlen; } } break; case NT_STR: { StrNode* sn = NSTR(node); UChar *s = sn->s; while (s < sn->end) { s += enclen(reg->enc, s, sn->end); (*len)++; } } break; case NT_QTFR: { QtfrNode* qn = NQTFR(node); if (qn->lower == qn->upper) { r = get_char_length_tree1(qn->target, reg, &tlen, level); if (r == 0) *len = (int )distance_multiply(tlen, qn->lower); } else r = GET_CHAR_LEN_VARLEN; } break; #ifdef USE_SUBEXP_CALL case NT_CALL: if (! IS_CALL_RECURSION(NCALL(node))) r = get_char_length_tree1(NCALL(node)->target, reg, len, level); else r = GET_CHAR_LEN_VARLEN; break; #endif case NT_CTYPE: *len = 1; break; case NT_CCLASS: case NT_CANY: *len = 1; break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL if (IS_ENCLOSE_CLEN_FIXED(en)) *len = en->char_len; else { r = get_char_length_tree1(en->target, reg, len, level); if (r == 0) { en->char_len = *len; SET_ENCLOSE_STATUS(node, NST_CLEN_FIXED); } } break; #endif case ENCLOSE_OPTION: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: r = get_char_length_tree1(en->target, reg, len, level); break; default: break; } } break; case NT_ANCHOR: break; default: r = GET_CHAR_LEN_VARLEN; break; } return r; } static int get_char_length_tree(Node* node, regex_t* reg, int* len) { return get_char_length_tree1(node, reg, len, 0); } /* x is not included y ==> 1 : 0 */ static int is_not_included(Node* x, Node* y, regex_t* reg) { int i; OnigDistance len; OnigCodePoint code; UChar *p; int ytype; retry: ytype = NTYPE(y); switch (NTYPE(x)) { case NT_CTYPE: { switch (ytype) { case NT_CTYPE: if (NCTYPE(y)->ctype == NCTYPE(x)->ctype && NCTYPE(y)->not != NCTYPE(x)->not && NCTYPE(y)->ascii_range == NCTYPE(x)->ascii_range) return 1; else return 0; break; case NT_CCLASS: swap: { Node* tmp; tmp = x; x = y; y = tmp; goto retry; } break; case NT_STR: goto swap; break; default: break; } } break; case NT_CCLASS: { CClassNode* xc = NCCLASS(x); switch (ytype) { case NT_CTYPE: switch (NCTYPE(y)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(y)->not == 0) { if (IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (BITSET_AT(xc->bs, i)) { if (NCTYPE(y)->ascii_range) { if (IS_CODE_SB_WORD(reg->enc, i)) return 0; } else { if (ONIGENC_IS_CODE_WORD(reg->enc, i)) return 0; } } } return 1; } return 0; } else { if (IS_NOT_NULL(xc->mbuf)) return 0; for (i = 0; i < SINGLE_BYTE_SIZE; i++) { int is_word; if (NCTYPE(y)->ascii_range) is_word = IS_CODE_SB_WORD(reg->enc, i); else is_word = ONIGENC_IS_CODE_WORD(reg->enc, i); if (! is_word) { if (!IS_NCCLASS_NOT(xc)) { if (BITSET_AT(xc->bs, i)) return 0; } else { if (! BITSET_AT(xc->bs, i)) return 0; } } } return 1; } break; default: break; } break; case NT_CCLASS: { int v; CClassNode* yc = NCCLASS(y); for (i = 0; i < SINGLE_BYTE_SIZE; i++) { v = BITSET_AT(xc->bs, i); if ((v != 0 && !IS_NCCLASS_NOT(xc)) || (v == 0 && IS_NCCLASS_NOT(xc))) { v = BITSET_AT(yc->bs, i); if ((v != 0 && !IS_NCCLASS_NOT(yc)) || (v == 0 && IS_NCCLASS_NOT(yc))) return 0; } } if ((IS_NULL(xc->mbuf) && !IS_NCCLASS_NOT(xc)) || (IS_NULL(yc->mbuf) && !IS_NCCLASS_NOT(yc))) return 1; return 0; } break; case NT_STR: goto swap; break; default: break; } } break; case NT_STR: { StrNode* xs = NSTR(x); if (NSTRING_LEN(x) == 0) break; switch (ytype) { case NT_CTYPE: switch (NCTYPE(y)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(y)->ascii_range) { if (ONIGENC_IS_MBC_ASCII_WORD(reg->enc, xs->s, xs->end)) return NCTYPE(y)->not; else return !(NCTYPE(y)->not); } else { if (ONIGENC_IS_MBC_WORD(reg->enc, xs->s, xs->end)) return NCTYPE(y)->not; else return !(NCTYPE(y)->not); } break; default: break; } break; case NT_CCLASS: { CClassNode* cc = NCCLASS(y); code = ONIGENC_MBC_TO_CODE(reg->enc, xs->s, xs->s + ONIGENC_MBC_MAXLEN(reg->enc)); return (onig_is_code_in_cc(reg->enc, code, cc) != 0 ? 0 : 1); } break; case NT_STR: { UChar *q; StrNode* ys = NSTR(y); len = NSTRING_LEN(x); if (len > NSTRING_LEN(y)) len = NSTRING_LEN(y); if (NSTRING_IS_AMBIG(x) || NSTRING_IS_AMBIG(y)) { /* tiny version */ return 0; } else { for (i = 0, p = ys->s, q = xs->s; (OnigDistance )i < len; i++, p++, q++) { if (*p != *q) return 1; } } } break; default: break; } } break; default: break; } return 0; } static Node* get_head_value_node(Node* node, int exact, regex_t* reg) { Node* n = NULL_NODE; switch (NTYPE(node)) { case NT_BREF: case NT_ALT: case NT_CANY: #ifdef USE_SUBEXP_CALL case NT_CALL: #endif break; case NT_CTYPE: case NT_CCLASS: if (exact == 0) { n = node; } break; case NT_LIST: n = get_head_value_node(NCAR(node), exact, reg); break; case NT_STR: { StrNode* sn = NSTR(node); if (sn->end <= sn->s) break; if (exact != 0 && !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { } else { n = node; } } break; case NT_QTFR: { QtfrNode* qn = NQTFR(node); if (qn->lower > 0) { #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (IS_NOT_NULL(qn->head_exact)) n = qn->head_exact; else #endif n = get_head_value_node(qn->target, exact, reg); } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_OPTION: { OnigOptionType options = reg->options; reg->options = NENCLOSE(node)->option; n = get_head_value_node(NENCLOSE(node)->target, exact, reg); reg->options = options; } break; case ENCLOSE_MEMORY: case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: n = get_head_value_node(en->target, exact, reg); break; } } break; case NT_ANCHOR: if (NANCHOR(node)->type == ANCHOR_PREC_READ) n = get_head_value_node(NANCHOR(node)->target, exact, reg); break; default: break; } return n; } static int check_type_tree(Node* node, int type_mask, int enclose_mask, int anchor_mask) { int type, r = 0; type = NTYPE(node); if ((NTYPE2BIT(type) & type_mask) == 0) return 1; switch (type) { case NT_LIST: case NT_ALT: do { r = check_type_tree(NCAR(node), type_mask, enclose_mask, anchor_mask); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = check_type_tree(NQTFR(node)->target, type_mask, enclose_mask, anchor_mask); break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); if ((en->type & enclose_mask) == 0) return 1; r = check_type_tree(en->target, type_mask, enclose_mask, anchor_mask); } break; case NT_ANCHOR: type = NANCHOR(node)->type; if ((type & anchor_mask) == 0) return 1; if (NANCHOR(node)->target) r = check_type_tree(NANCHOR(node)->target, type_mask, enclose_mask, anchor_mask); break; default: break; } return r; } #ifdef USE_SUBEXP_CALL # define RECURSION_EXIST 1 # define RECURSION_INFINITE 2 static int subexp_inf_recursive_check(Node* node, ScanEnv* env, int head) { int type; int r = 0; type = NTYPE(node); switch (type) { case NT_LIST: { Node *x; OnigDistance min; int ret; x = node; do { ret = subexp_inf_recursive_check(NCAR(x), env, head); if (ret < 0 || ret == RECURSION_INFINITE) return ret; r |= ret; if (head) { ret = get_min_match_length(NCAR(x), &min, env); if (ret != 0) return ret; if (min != 0) head = 0; } } while (IS_NOT_NULL(x = NCDR(x))); } break; case NT_ALT: { int ret; r = RECURSION_EXIST; do { ret = subexp_inf_recursive_check(NCAR(node), env, head); if (ret < 0 || ret == RECURSION_INFINITE) return ret; r &= ret; } while (IS_NOT_NULL(node = NCDR(node))); } break; case NT_QTFR: r = subexp_inf_recursive_check(NQTFR(node)->target, env, head); if (r == RECURSION_EXIST) { if (NQTFR(node)->lower == 0) r = 0; } break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: r = subexp_inf_recursive_check(an->target, env, head); break; } } break; case NT_CALL: r = subexp_inf_recursive_check(NCALL(node)->target, env, head); break; case NT_ENCLOSE: if (IS_ENCLOSE_MARK2(NENCLOSE(node))) return 0; else if (IS_ENCLOSE_MARK1(NENCLOSE(node))) return (head == 0 ? RECURSION_EXIST : RECURSION_INFINITE); else { SET_ENCLOSE_STATUS(node, NST_MARK2); r = subexp_inf_recursive_check(NENCLOSE(node)->target, env, head); CLEAR_ENCLOSE_STATUS(node, NST_MARK2); } break; default: break; } return r; } static int subexp_inf_recursive_check_trav(Node* node, ScanEnv* env) { int type; int r = 0; type = NTYPE(node); switch (type) { case NT_LIST: case NT_ALT: do { r = subexp_inf_recursive_check_trav(NCAR(node), env); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = subexp_inf_recursive_check_trav(NQTFR(node)->target, env); break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: r = subexp_inf_recursive_check_trav(an->target, env); break; } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); if (IS_ENCLOSE_RECURSION(en)) { SET_ENCLOSE_STATUS(node, NST_MARK1); r = subexp_inf_recursive_check(en->target, env, 1); if (r > 0) return ONIGERR_NEVER_ENDING_RECURSION; CLEAR_ENCLOSE_STATUS(node, NST_MARK1); } r = subexp_inf_recursive_check_trav(en->target, env); } break; default: break; } return r; } static int subexp_recursive_check(Node* node) { int r = 0; switch (NTYPE(node)) { case NT_LIST: case NT_ALT: do { r |= subexp_recursive_check(NCAR(node)); } while (IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = subexp_recursive_check(NQTFR(node)->target); break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: r = subexp_recursive_check(an->target); break; } } break; case NT_CALL: r = subexp_recursive_check(NCALL(node)->target); if (r != 0) SET_CALL_RECURSION(node); break; case NT_ENCLOSE: if (IS_ENCLOSE_MARK2(NENCLOSE(node))) return 0; else if (IS_ENCLOSE_MARK1(NENCLOSE(node))) return 1; /* recursion */ else { SET_ENCLOSE_STATUS(node, NST_MARK2); r = subexp_recursive_check(NENCLOSE(node)->target); CLEAR_ENCLOSE_STATUS(node, NST_MARK2); } break; default: break; } return r; } static int subexp_recursive_check_trav(Node* node, ScanEnv* env) { # define FOUND_CALLED_NODE 1 int type; int r = 0; type = NTYPE(node); switch (type) { case NT_LIST: case NT_ALT: { int ret; do { ret = subexp_recursive_check_trav(NCAR(node), env); if (ret == FOUND_CALLED_NODE) r = FOUND_CALLED_NODE; else if (ret < 0) return ret; } while (IS_NOT_NULL(node = NCDR(node))); } break; case NT_QTFR: r = subexp_recursive_check_trav(NQTFR(node)->target, env); if (NQTFR(node)->upper == 0) { if (r == FOUND_CALLED_NODE) NQTFR(node)->is_refered = 1; } break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: r = subexp_recursive_check_trav(an->target, env); break; } } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); if (! IS_ENCLOSE_RECURSION(en)) { if (IS_ENCLOSE_CALLED(en)) { SET_ENCLOSE_STATUS(node, NST_MARK1); r = subexp_recursive_check(en->target); if (r != 0) SET_ENCLOSE_STATUS(node, NST_RECURSION); CLEAR_ENCLOSE_STATUS(node, NST_MARK1); } } r = subexp_recursive_check_trav(en->target, env); if (IS_ENCLOSE_CALLED(en)) r |= FOUND_CALLED_NODE; } break; default: break; } return r; } static int setup_subexp_call(Node* node, ScanEnv* env) { int type; int r = 0; type = NTYPE(node); switch (type) { case NT_LIST: do { r = setup_subexp_call(NCAR(node), env); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_ALT: do { r = setup_subexp_call(NCAR(node), env); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_QTFR: r = setup_subexp_call(NQTFR(node)->target, env); break; case NT_ENCLOSE: r = setup_subexp_call(NENCLOSE(node)->target, env); break; case NT_CALL: { CallNode* cn = NCALL(node); Node** nodes = SCANENV_MEM_NODES(env); if (cn->group_num != 0) { int gnum = cn->group_num; # ifdef USE_NAMED_GROUP if (env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } # endif if (gnum > env->num_mem) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_GROUP_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_GROUP_REFERENCE; } # ifdef USE_NAMED_GROUP set_call_attr: # endif cn->target = nodes[cn->group_num]; if (IS_NULL(cn->target)) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } SET_ENCLOSE_STATUS(cn->target, NST_CALLED); BIT_STATUS_ON_AT(env->bt_mem_start, cn->group_num); cn->unset_addr_list = env->unset_addr_list; } # ifdef USE_NAMED_GROUP # ifdef USE_PERL_SUBEXP_CALL else if (cn->name == cn->name_end) { goto set_call_attr; } # endif else { int *refs; int n = onig_name_to_group_numbers(env->reg, cn->name, cn->name_end, &refs); if (n <= 0) { onig_scan_env_set_error_string(env, ONIGERR_UNDEFINED_NAME_REFERENCE, cn->name, cn->name_end); return ONIGERR_UNDEFINED_NAME_REFERENCE; } else if (n > 1 && ! IS_SYNTAX_BV(env->syntax, ONIG_SYN_ALLOW_MULTIPLEX_DEFINITION_NAME_CALL)) { onig_scan_env_set_error_string(env, ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL, cn->name, cn->name_end); return ONIGERR_MULTIPLEX_DEFINITION_NAME_CALL; } else { cn->group_num = refs[0]; goto set_call_attr; } } # endif } break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: case ANCHOR_PREC_READ_NOT: case ANCHOR_LOOK_BEHIND: case ANCHOR_LOOK_BEHIND_NOT: r = setup_subexp_call(an->target, env); break; } } break; default: break; } return r; } #endif /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) (? (?type; head = an->target; np = NCAR(head); swap_node(node, head); NCAR(node) = head; NANCHOR(head)->target = np; np = node; while ((np = NCDR(np)) != NULL_NODE) { insert_node = onig_node_new_anchor(anc_type); CHECK_NULL_RETURN_MEMERR(insert_node); NANCHOR(insert_node)->target = NCAR(np); NCAR(np) = insert_node; } if (anc_type == ANCHOR_LOOK_BEHIND_NOT) { np = node; do { SET_NTYPE(np, NT_LIST); /* alt -> list */ } while ((np = NCDR(np)) != NULL_NODE); } return 0; } static int setup_look_behind(Node* node, regex_t* reg, ScanEnv* env) { int r, len; AnchorNode* an = NANCHOR(node); r = get_char_length_tree(an->target, reg, &len); if (r == 0) an->char_len = len; else if (r == GET_CHAR_LEN_VARLEN) r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; else if (r == GET_CHAR_LEN_TOP_ALT_VARLEN) { if (IS_SYNTAX_BV(env->syntax, ONIG_SYN_DIFFERENT_LEN_ALT_LOOK_BEHIND)) r = divide_look_behind_alternatives(node); else r = ONIGERR_INVALID_LOOK_BEHIND_PATTERN; } return r; } static int next_setup(Node* node, Node* next_node, int in_root, regex_t* reg) { int type; retry: type = NTYPE(node); if (type == NT_QTFR) { QtfrNode* qn = NQTFR(node); if (qn->greedy && IS_REPEAT_INFINITE(qn->upper)) { #ifdef USE_QTFR_PEEK_NEXT Node* n = get_head_value_node(next_node, 1, reg); /* '\0': for UTF-16BE etc... */ if (IS_NOT_NULL(n) && NSTR(n)->s[0] != '\0') { qn->next_head_exact = n; } #endif /* automatic possessification a*b ==> (?>a*)b */ if (qn->lower <= 1) { int ttype = NTYPE(qn->target); if (IS_NODE_TYPE_SIMPLE(ttype)) { Node *x, *y; x = get_head_value_node(qn->target, 0, reg); if (IS_NOT_NULL(x)) { y = get_head_value_node(next_node, 0, reg); if (IS_NOT_NULL(y) && is_not_included(x, y, reg)) { Node* en = onig_node_new_enclose(ENCLOSE_STOP_BACKTRACK); CHECK_NULL_RETURN_MEMERR(en); SET_ENCLOSE_STATUS(en, NST_STOP_BT_SIMPLE_REPEAT); swap_node(node, en); NENCLOSE(node)->target = en; } } } } #ifndef ONIG_DONT_OPTIMIZE if (NTYPE(node) == NT_QTFR && /* the type may be changed by above block */ in_root && /* qn->lower == 0 && */ NTYPE(qn->target) == NT_CANY && ! IS_MULTILINE(reg->options)) { /* implicit anchor: /.*a/ ==> /(?:^|\G).*a/ */ Node *np; np = onig_node_new_list(NULL_NODE, NULL_NODE); CHECK_NULL_RETURN_MEMERR(np); swap_node(node, np); NCDR(node) = onig_node_new_list(np, NULL_NODE); if (IS_NULL(NCDR(node))) { onig_node_free(np); return ONIGERR_MEMORY; } np = onig_node_new_anchor(ANCHOR_ANYCHAR_STAR); /* (?:^|\G) */ CHECK_NULL_RETURN_MEMERR(np); NCAR(node) = np; } #endif } } else if (type == NT_ENCLOSE) { EncloseNode* en = NENCLOSE(node); in_root = 0; if (en->type == ENCLOSE_MEMORY) { node = en->target; goto retry; } } return 0; } static int update_string_node_case_fold(regex_t* reg, Node *node) { UChar *p, *end, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; UChar *sbuf, *ebuf, *sp; int r, i, len; OnigDistance sbuf_size; StrNode* sn = NSTR(node); end = sn->end; sbuf_size = (end - sn->s) * 2; sbuf = (UChar* )xmalloc(sbuf_size); CHECK_NULL_RETURN_MEMERR(sbuf); ebuf = sbuf + sbuf_size; sp = sbuf; p = sn->s; while (p < end) { len = ONIGENC_MBC_CASE_FOLD(reg->enc, reg->case_fold_flag, &p, end, buf); for (i = 0; i < len; i++) { if (sp >= ebuf) { UChar* p = (UChar* )xrealloc(sbuf, sbuf_size * 2); if (IS_NULL(p)) { xfree(sbuf); return ONIGERR_MEMORY; } sbuf = p; sp = sbuf + sbuf_size; sbuf_size *= 2; ebuf = sbuf + sbuf_size; } *sp++ = buf[i]; } } r = onig_node_str_set(node, sbuf, sp); xfree(sbuf); return r; } static int expand_case_fold_make_rem_string(Node** rnode, UChar *s, UChar *end, regex_t* reg) { int r; Node *node; node = onig_node_new_str(s, end); if (IS_NULL(node)) return ONIGERR_MEMORY; r = update_string_node_case_fold(reg, node); if (r != 0) { onig_node_free(node); return r; } NSTRING_SET_AMBIG(node); NSTRING_SET_DONT_GET_OPT_INFO(node); *rnode = node; return 0; } static int is_case_fold_variable_len(int item_num, OnigCaseFoldCodeItem items[], int slen) { int i; for (i = 0; i < item_num; i++) { if (items[i].byte_len != slen) { return 1; } if (items[i].code_len != 1) { return 1; } } return 0; } static int expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], UChar *p, int slen, UChar *end, regex_t* reg, Node **rnode) { int r, i, j, len, varlen; Node *anode, *var_anode, *snode, *xnode, *an; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; *rnode = var_anode = NULL_NODE; varlen = 0; for (i = 0; i < item_num; i++) { if (items[i].byte_len != slen) { varlen = 1; break; } } if (varlen != 0) { *rnode = var_anode = onig_node_new_alt(NULL_NODE, NULL_NODE); if (IS_NULL(var_anode)) return ONIGERR_MEMORY; xnode = onig_node_new_list(NULL, NULL); if (IS_NULL(xnode)) goto mem_err; NCAR(var_anode) = xnode; anode = onig_node_new_alt(NULL_NODE, NULL_NODE); if (IS_NULL(anode)) goto mem_err; NCAR(xnode) = anode; } else { *rnode = anode = onig_node_new_alt(NULL_NODE, NULL_NODE); if (IS_NULL(anode)) return ONIGERR_MEMORY; } snode = onig_node_new_str(p, p + slen); if (IS_NULL(snode)) goto mem_err; NCAR(anode) = snode; for (i = 0; i < item_num; i++) { snode = onig_node_new_str(NULL, NULL); if (IS_NULL(snode)) goto mem_err; for (j = 0; j < items[i].code_len; j++) { len = ONIGENC_CODE_TO_MBC(reg->enc, items[i].code[j], buf); if (len < 0) { r = len; goto mem_err2; } r = onig_node_str_cat(snode, buf, buf + len); if (r != 0) goto mem_err2; } an = onig_node_new_alt(NULL_NODE, NULL_NODE); if (IS_NULL(an)) { goto mem_err2; } if (items[i].byte_len != slen) { Node *rem; UChar *q = p + items[i].byte_len; if (q < end) { r = expand_case_fold_make_rem_string(&rem, q, end, reg); if (r != 0) { onig_node_free(an); goto mem_err2; } xnode = onig_node_list_add(NULL_NODE, snode); if (IS_NULL(xnode)) { onig_node_free(an); onig_node_free(rem); goto mem_err2; } if (IS_NULL(onig_node_list_add(xnode, rem))) { onig_node_free(an); onig_node_free(xnode); onig_node_free(rem); goto mem_err; } NCAR(an) = xnode; } else { NCAR(an) = snode; } NCDR(var_anode) = an; var_anode = an; } else { NCAR(an) = snode; NCDR(anode) = an; anode = an; } } return varlen; mem_err2: onig_node_free(snode); mem_err: onig_node_free(*rnode); return ONIGERR_MEMORY; } static int expand_case_fold_string(Node* node, regex_t* reg) { #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 int r, n, len, alt_num; int varlen = 0; UChar *start, *end, *p; Node *top_root, *root, *snode, *prev_node; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; StrNode* sn = NSTR(node); if (NSTRING_IS_AMBIG(node)) return 0; start = sn->s; end = sn->end; if (start >= end) return 0; r = 0; top_root = root = prev_node = snode = NULL_NODE; alt_num = 1; p = start; while (p < end) { n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(reg->enc, reg->case_fold_flag, p, end, items); if (n < 0) { r = n; goto err; } len = enclen(reg->enc, p, end); varlen = is_case_fold_variable_len(n, items, len); if (n == 0 || varlen == 0) { if (IS_NULL(snode)) { if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { top_root = root = onig_node_list_add(NULL_NODE, prev_node); if (IS_NULL(root)) { onig_node_free(prev_node); goto mem_err; } } prev_node = snode = onig_node_new_str(NULL, NULL); if (IS_NULL(snode)) goto mem_err; if (IS_NOT_NULL(root)) { if (IS_NULL(onig_node_list_add(root, snode))) { onig_node_free(snode); goto mem_err; } } } r = onig_node_str_cat(snode, p, p + len); if (r != 0) goto err; } else { alt_num *= (n + 1); if (alt_num > THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION) break; if (IS_NOT_NULL(snode)) { r = update_string_node_case_fold(reg, snode); if (r == 0) { NSTRING_SET_AMBIG(snode); } } if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { top_root = root = onig_node_list_add(NULL_NODE, prev_node); if (IS_NULL(root)) { onig_node_free(prev_node); goto mem_err; } } r = expand_case_fold_string_alt(n, items, p, len, end, reg, &prev_node); if (r < 0) goto mem_err; if (r == 1) { if (IS_NULL(root)) { top_root = prev_node; } else { if (IS_NULL(onig_node_list_add(root, prev_node))) { onig_node_free(prev_node); goto mem_err; } } root = NCAR(prev_node); } else { /* r == 0 */ if (IS_NOT_NULL(root)) { if (IS_NULL(onig_node_list_add(root, prev_node))) { onig_node_free(prev_node); goto mem_err; } } } snode = NULL_NODE; } p += len; } if (IS_NOT_NULL(snode)) { r = update_string_node_case_fold(reg, snode); if (r == 0) { NSTRING_SET_AMBIG(snode); } } if (p < end) { Node *srem; r = expand_case_fold_make_rem_string(&srem, p, end, reg); if (r != 0) goto mem_err; if (IS_NOT_NULL(prev_node) && IS_NULL(root)) { top_root = root = onig_node_list_add(NULL_NODE, prev_node); if (IS_NULL(root)) { onig_node_free(srem); onig_node_free(prev_node); goto mem_err; } } if (IS_NULL(root)) { prev_node = srem; } else { if (IS_NULL(onig_node_list_add(root, srem))) { onig_node_free(srem); goto mem_err; } } } /* ending */ top_root = (IS_NOT_NULL(top_root) ? top_root : prev_node); swap_node(node, top_root); onig_node_free(top_root); return 0; mem_err: r = ONIGERR_MEMORY; err: onig_node_free(top_root); return r; } #ifdef USE_COMBINATION_EXPLOSION_CHECK # define CEC_THRES_NUM_BIG_REPEAT 512 # define CEC_INFINITE_NUM 0x7fffffff # define CEC_IN_INFINITE_REPEAT (1<<0) # define CEC_IN_FINITE_REPEAT (1<<1) # define CEC_CONT_BIG_REPEAT (1<<2) static int setup_comb_exp_check(Node* node, int state, ScanEnv* env) { int type; int r = state; type = NTYPE(node); switch (type) { case NT_LIST: { Node* prev = NULL_NODE; do { r = setup_comb_exp_check(NCAR(node), r, env); prev = NCAR(node); } while (r >= 0 && IS_NOT_NULL(node = NCDR(node))); } break; case NT_ALT: { int ret; do { ret = setup_comb_exp_check(NCAR(node), state, env); r |= ret; } while (ret >= 0 && IS_NOT_NULL(node = NCDR(node))); } break; case NT_QTFR: { int child_state = state; int add_state = 0; QtfrNode* qn = NQTFR(node); Node* target = qn->target; int var_num; if (! IS_REPEAT_INFINITE(qn->upper)) { if (qn->upper > 1) { /* {0,1}, {1,1} are allowed */ child_state |= CEC_IN_FINITE_REPEAT; /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */ if (env->backrefed_mem == 0) { if (NTYPE(qn->target) == NT_ENCLOSE) { EncloseNode* en = NENCLOSE(qn->target); if (en->type == ENCLOSE_MEMORY) { if (NTYPE(en->target) == NT_QTFR) { QtfrNode* q = NQTFR(en->target); if (IS_REPEAT_INFINITE(q->upper) && q->greedy == qn->greedy) { qn->upper = (qn->lower == 0 ? 1 : qn->lower); if (qn->upper == 1) child_state = state; } } } } } } } if (state & CEC_IN_FINITE_REPEAT) { qn->comb_exp_check_num = -1; } else { if (IS_REPEAT_INFINITE(qn->upper)) { var_num = CEC_INFINITE_NUM; child_state |= CEC_IN_INFINITE_REPEAT; } else { var_num = qn->upper - qn->lower; } if (var_num >= CEC_THRES_NUM_BIG_REPEAT) add_state |= CEC_CONT_BIG_REPEAT; if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) || ((state & CEC_CONT_BIG_REPEAT) != 0 && var_num >= CEC_THRES_NUM_BIG_REPEAT)) { if (qn->comb_exp_check_num == 0) { env->num_comb_exp_check++; qn->comb_exp_check_num = env->num_comb_exp_check; if (env->curr_max_regnum > env->comb_exp_max_regnum) env->comb_exp_max_regnum = env->curr_max_regnum; } } } r = setup_comb_exp_check(target, child_state, env); r |= add_state; } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_MEMORY: { if (env->curr_max_regnum < en->regnum) env->curr_max_regnum = en->regnum; r = setup_comb_exp_check(en->target, state, env); } break; default: r = setup_comb_exp_check(en->target, state, env); break; } } break; # ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) env->has_recursion = 1; else r = setup_comb_exp_check(NCALL(node)->target, state, env); break; # endif default: break; } return r; } #endif #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REPEAT (1<<2) #define IN_VAR_REPEAT (1<<3) #define IN_ROOT (1<<4) #define IN_CALL (1<<5) #define IN_RECCALL (1<<6) /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) 2. expand ignore-case in char class. 3. set memory status bit flags. (reg->mem_stats) 4. set qn->head_exact for [push, exact] -> [push_or_jump_exact1, exact]. 5. find invalid patterns in look-behind. 6. expand repeated string. */ static int setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env) { int type; int r = 0; int in_root = state & IN_ROOT; state &= ~IN_ROOT; restart: type = NTYPE(node); switch (type) { case NT_LIST: { Node* prev = NULL_NODE; int prev_in_root = 0; state |= in_root; do { r = setup_tree(NCAR(node), reg, state, env); if (IS_NOT_NULL(prev) && r == 0) { r = next_setup(prev, NCAR(node), prev_in_root, reg); } prev = NCAR(node); prev_in_root = state & IN_ROOT; state &= ~IN_ROOT; } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); } break; case NT_ALT: do { r = setup_tree(NCAR(node), reg, (state | IN_ALT), env); } while (r == 0 && IS_NOT_NULL(node = NCDR(node))); break; case NT_CCLASS: break; case NT_STR: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { r = expand_case_fold_string(node, reg); } break; case NT_CTYPE: case NT_CANY: break; #ifdef USE_SUBEXP_CALL case NT_CALL: break; #endif case NT_BREF: { int i; int* p; Node** nodes = SCANENV_MEM_NODES(env); BRefNode* br = NBREF(node); p = BACKREFS_P(br); for (i = 0; i < br->back_num; i++) { if (p[i] > env->num_mem) return ONIGERR_INVALID_BACKREF; BIT_STATUS_ON_AT(env->backrefed_mem, p[i]); BIT_STATUS_ON_AT(env->bt_mem_start, p[i]); #ifdef USE_BACKREF_WITH_LEVEL if (IS_BACKREF_NEST_LEVEL(br)) { BIT_STATUS_ON_AT(env->bt_mem_end, p[i]); } #endif SET_ENCLOSE_STATUS(nodes[p[i]], NST_MEM_BACKREFED); } } break; case NT_QTFR: { OnigDistance d; QtfrNode* qn = NQTFR(node); Node* target = qn->target; if ((state & IN_REPEAT) != 0) { qn->state |= NST_IN_REPEAT; } if (IS_REPEAT_INFINITE(qn->upper) || qn->upper >= 1) { r = get_min_match_length(target, &d, env); if (r) break; if (d == 0) { qn->target_empty_info = NQ_TARGET_IS_EMPTY; #ifdef USE_MONOMANIAC_CHECK_CAPTURES_IN_ENDLESS_REPEAT r = quantifiers_memory_node_info(target); if (r < 0) break; if (r > 0) { qn->target_empty_info = r; } #endif #if 0 r = get_max_match_length(target, &d, env); if (r == 0 && d == 0) { /* ()* ==> ()?, ()+ ==> () */ qn->upper = 1; if (qn->lower > 1) qn->lower = 1; if (NTYPE(target) == NT_STR) { qn->upper = qn->lower = 0; /* /(?:)+/ ==> // */ } } #endif } } state |= IN_REPEAT; if (qn->lower != qn->upper) state |= IN_VAR_REPEAT; r = setup_tree(target, reg, state, env); if (r) break; /* expand string */ #define EXPAND_STRING_MAX_LENGTH 100 if (NTYPE(target) == NT_STR) { if (qn->lower > 1) { int i, n = qn->lower; OnigDistance len = NSTRING_LEN(target); StrNode* sn = NSTR(target); Node* np; np = onig_node_new_str(sn->s, sn->end); if (IS_NULL(np)) return ONIGERR_MEMORY; NSTR(np)->flag = sn->flag; for (i = 1; i < n && (i+1) * len <= EXPAND_STRING_MAX_LENGTH; i++) { r = onig_node_str_cat(np, sn->s, sn->end); if (r) { onig_node_free(np); return r; } } if (i < qn->upper || IS_REPEAT_INFINITE(qn->upper)) { Node *np1, *np2; qn->lower -= i; if (! IS_REPEAT_INFINITE(qn->upper)) qn->upper -= i; np1 = onig_node_new_list(np, NULL); if (IS_NULL(np1)) { onig_node_free(np); return ONIGERR_MEMORY; } swap_node(np1, node); np2 = onig_node_list_add(node, np1); if (IS_NULL(np2)) { onig_node_free(np1); return ONIGERR_MEMORY; } } else { swap_node(np, node); onig_node_free(np); } break; /* break case NT_QTFR: */ } } #ifdef USE_OP_PUSH_OR_JUMP_EXACT if (qn->greedy && (qn->target_empty_info != 0)) { if (NTYPE(target) == NT_QTFR) { QtfrNode* tqn = NQTFR(target); if (IS_NOT_NULL(tqn->head_exact)) { qn->head_exact = tqn->head_exact; tqn->head_exact = NULL; } } else { qn->head_exact = get_head_value_node(qn->target, 1, reg); } } #endif } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_OPTION: { OnigOptionType options = reg->options; state |= in_root; reg->options = NENCLOSE(node)->option; r = setup_tree(NENCLOSE(node)->target, reg, state, env); reg->options = options; } break; case ENCLOSE_MEMORY: if ((state & (IN_ALT | IN_NOT | IN_VAR_REPEAT | IN_CALL)) != 0) { BIT_STATUS_ON_AT(env->bt_mem_start, en->regnum); /* SET_ENCLOSE_STATUS(node, NST_MEM_IN_ALT_NOT); */ } if (IS_ENCLOSE_CALLED(en)) state |= IN_CALL; if (IS_ENCLOSE_RECURSION(en)) state |= IN_RECCALL; else if ((state & IN_RECCALL) != 0) SET_CALL_RECURSION(node); r = setup_tree(en->target, reg, state, env); break; case ENCLOSE_STOP_BACKTRACK: { Node* target = en->target; r = setup_tree(target, reg, state, env); if (NTYPE(target) == NT_QTFR) { QtfrNode* tqn = NQTFR(target); if (IS_REPEAT_INFINITE(tqn->upper) && tqn->lower <= 1 && tqn->greedy != 0) { /* (?>a*), a*+ etc... */ int qtype = NTYPE(tqn->target); if (IS_NODE_TYPE_SIMPLE(qtype)) SET_ENCLOSE_STATUS(node, NST_STOP_BT_SIMPLE_REPEAT); } } } break; case ENCLOSE_CONDITION: #ifdef USE_NAMED_GROUP if (! IS_ENCLOSE_NAME_REF(NENCLOSE(node)) && env->num_named > 0 && IS_SYNTAX_BV(env->syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && !ONIG_IS_OPTION_ON(env->option, ONIG_OPTION_CAPTURE_GROUP)) { return ONIGERR_NUMBERED_BACKREF_OR_CALL_NOT_ALLOWED; } #endif if (NENCLOSE(node)->regnum > env->num_mem) return ONIGERR_INVALID_BACKREF; r = setup_tree(NENCLOSE(node)->target, reg, state, env); break; } } break; case NT_ANCHOR: { AnchorNode* an = NANCHOR(node); switch (an->type) { case ANCHOR_PREC_READ: r = setup_tree(an->target, reg, state, env); break; case ANCHOR_PREC_READ_NOT: r = setup_tree(an->target, reg, (state | IN_NOT), env); break; /* allowed node types in look-behind */ #define ALLOWED_TYPE_IN_LB \ ( BIT_NT_LIST | BIT_NT_ALT | BIT_NT_STR | BIT_NT_CCLASS | BIT_NT_CTYPE | \ BIT_NT_CANY | BIT_NT_ANCHOR | BIT_NT_ENCLOSE | BIT_NT_QTFR | BIT_NT_CALL ) #define ALLOWED_ENCLOSE_IN_LB ( ENCLOSE_MEMORY | ENCLOSE_OPTION ) #define ALLOWED_ENCLOSE_IN_LB_NOT ENCLOSE_OPTION #define ALLOWED_ANCHOR_IN_LB \ ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) #define ALLOWED_ANCHOR_IN_LB_NOT \ ( ANCHOR_LOOK_BEHIND | ANCHOR_LOOK_BEHIND_NOT | ANCHOR_BEGIN_LINE | \ ANCHOR_END_LINE | ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_KEEP | \ ANCHOR_WORD_BOUND | ANCHOR_NOT_WORD_BOUND | \ ANCHOR_WORD_BEGIN | ANCHOR_WORD_END ) case ANCHOR_LOOK_BEHIND: { r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, ALLOWED_ENCLOSE_IN_LB, ALLOWED_ANCHOR_IN_LB); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; r = setup_tree(an->target, reg, state, env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } break; case ANCHOR_LOOK_BEHIND_NOT: { r = check_type_tree(an->target, ALLOWED_TYPE_IN_LB, ALLOWED_ENCLOSE_IN_LB_NOT, ALLOWED_ANCHOR_IN_LB_NOT); if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; r = setup_tree(an->target, reg, (state | IN_NOT), env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } break; } } break; default: break; } return r; } #ifndef USE_SUNDAY_QUICK_SEARCH /* set skip map for Boyer-Moore search */ static int set_bm_skip(UChar* s, UChar* end, regex_t* reg, UChar skip[], int** int_skip, int ignore_case) { OnigDistance i, len; int clen, flen, n, j, k; UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; OnigEncoding enc = reg->enc; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len; n = 0; for (i = 0; i < len - 1; i += clen) { p = s + i; if (ignore_case) n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) return 1; /* different length isn't supported. */ flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); if (flen != clen) return 1; /* different length isn't supported. */ } for (j = 0; j < clen; j++) { skip[s[i + j]] = (UChar )(len - 1 - i - j); for (k = 0; k < n; k++) { skip[buf[k][j]] = (UChar )(len - 1 - i - j); } } } } else { # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE /* This should not happen. */ return ONIGERR_TYPE_BUG; # else if (IS_NULL(*int_skip)) { *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len; n = 0; for (i = 0; i < len - 1; i += clen) { p = s + i; if (ignore_case) n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) return 1; /* different length isn't supported. */ flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); if (flen != clen) return 1; /* different length isn't supported. */ } for (j = 0; j < clen; j++) { (*int_skip)[s[i + j]] = (int )(len - 1 - i - j); for (k = 0; k < n; k++) { (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j); } } } # endif } return 0; } #else /* USE_SUNDAY_QUICK_SEARCH */ /* set skip map for Sunday's quick search */ static int set_bm_skip(UChar* s, UChar* end, regex_t* reg, UChar skip[], int** int_skip, int ignore_case) { OnigDistance i, len; int clen, flen, n, j, k; UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; OnigEncoding enc = reg->enc; len = end - s; if (len < ONIG_CHAR_TABLE_SIZE) { for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1); n = 0; for (i = 0; i < len; i += clen) { p = s + i; if (ignore_case) n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) return 1; /* different length isn't supported. */ flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); if (flen != clen) return 1; /* different length isn't supported. */ } for (j = 0; j < clen; j++) { skip[s[i + j]] = (UChar )(len - i - j); for (k = 0; k < n; k++) { skip[buf[k][j]] = (UChar )(len - i - j); } } } } else { # if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE /* This should not happen. */ return ONIGERR_TYPE_BUG; # else if (IS_NULL(*int_skip)) { *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; } for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1); n = 0; for (i = 0; i < len; i += clen) { p = s + i; if (ignore_case) n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, p, end, items); clen = enclen(enc, p, end); for (j = 0; j < n; j++) { if ((items[j].code_len != 1) || (items[j].byte_len != clen)) return 1; /* different length isn't supported. */ flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); if (flen != clen) return 1; /* different length isn't supported. */ } for (j = 0; j < clen; j++) { (*int_skip)[s[i + j]] = (int )(len - i - j); for (k = 0; k < n; k++) { (*int_skip)[buf[k][j]] = (int )(len - i - j); } } } # endif } return 0; } #endif /* USE_SUNDAY_QUICK_SEARCH */ typedef struct { OnigDistance min; /* min byte length */ OnigDistance max; /* max byte length */ } MinMaxLen; typedef struct { MinMaxLen mmd; OnigEncoding enc; OnigOptionType options; OnigCaseFoldType case_fold_flag; ScanEnv* scan_env; } OptEnv; typedef struct { int left_anchor; int right_anchor; } OptAncInfo; typedef struct { MinMaxLen mmd; /* info position */ OptAncInfo anc; int reach_end; int ignore_case; /* -1: unset, 0: case sensitive, 1: ignore case */ int len; UChar s[OPT_EXACT_MAXLEN]; } OptExactInfo; typedef struct { MinMaxLen mmd; /* info position */ OptAncInfo anc; int value; /* weighted value */ UChar map[ONIG_CHAR_TABLE_SIZE]; } OptMapInfo; typedef struct { MinMaxLen len; OptAncInfo anc; OptExactInfo exb; /* boundary */ OptExactInfo exm; /* middle */ OptExactInfo expr; /* prec read (?=...) */ OptMapInfo map; /* boundary */ } NodeOptInfo; static int map_position_value(OnigEncoding enc, int i) { static const short int ByteValTable[] = { 5, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 1, 1, 10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 12, 4, 7, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 1 }; if (i < numberof(ByteValTable)) { if (i == 0 && ONIGENC_MBC_MINLEN(enc) > 1) return 20; else return (int )ByteValTable[i]; } else return 4; /* Take it easy. */ } static int distance_value(MinMaxLen* mm) { /* 1000 / (min-max-dist + 1) */ static const short int dist_vals[] = { 1000, 500, 333, 250, 200, 167, 143, 125, 111, 100, 91, 83, 77, 71, 67, 63, 59, 56, 53, 50, 48, 45, 43, 42, 40, 38, 37, 36, 34, 33, 32, 31, 30, 29, 29, 28, 27, 26, 26, 25, 24, 24, 23, 23, 22, 22, 21, 21, 20, 20, 20, 19, 19, 19, 18, 18, 18, 17, 17, 17, 16, 16, 16, 16, 15, 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 13, 13, 12, 12, 12, 12, 12, 12, 11, 11, 11, 11, 11, 11, 11, 11, 11, 10, 10, 10, 10, 10 }; OnigDistance d; if (mm->max == ONIG_INFINITE_DISTANCE) return 0; d = mm->max - mm->min; if (d < numberof(dist_vals)) /* return dist_vals[d] * 16 / (mm->min + 12); */ return (int )dist_vals[d]; else return 1; } static int comp_distance_value(MinMaxLen* d1, MinMaxLen* d2, int v1, int v2) { if (v2 <= 0) return -1; if (v1 <= 0) return 1; v1 *= distance_value(d1); v2 *= distance_value(d2); if (v2 > v1) return 1; if (v2 < v1) return -1; if (d2->min < d1->min) return 1; if (d2->min > d1->min) return -1; return 0; } static int is_equal_mml(MinMaxLen* a, MinMaxLen* b) { return (a->min == b->min && a->max == b->max) ? 1 : 0; } static void set_mml(MinMaxLen* mml, OnigDistance min, OnigDistance max) { mml->min = min; mml->max = max; } static void clear_mml(MinMaxLen* mml) { mml->min = mml->max = 0; } static void copy_mml(MinMaxLen* to, MinMaxLen* from) { to->min = from->min; to->max = from->max; } static void add_mml(MinMaxLen* to, MinMaxLen* from) { to->min = distance_add(to->min, from->min); to->max = distance_add(to->max, from->max); } #if 0 static void add_len_mml(MinMaxLen* to, OnigDistance len) { to->min = distance_add(to->min, len); to->max = distance_add(to->max, len); } #endif static void alt_merge_mml(MinMaxLen* to, MinMaxLen* from) { if (to->min > from->min) to->min = from->min; if (to->max < from->max) to->max = from->max; } static void copy_opt_env(OptEnv* to, OptEnv* from) { *to = *from; } static void clear_opt_anc_info(OptAncInfo* anc) { anc->left_anchor = 0; anc->right_anchor = 0; } static void copy_opt_anc_info(OptAncInfo* to, OptAncInfo* from) { *to = *from; } static void concat_opt_anc_info(OptAncInfo* to, OptAncInfo* left, OptAncInfo* right, OnigDistance left_len, OnigDistance right_len) { clear_opt_anc_info(to); to->left_anchor = left->left_anchor; if (left_len == 0) { to->left_anchor |= right->left_anchor; } to->right_anchor = right->right_anchor; if (right_len == 0) { to->right_anchor |= left->right_anchor; } else { to->right_anchor |= (left->right_anchor & ANCHOR_PREC_READ_NOT); } } static int is_left_anchor(int anc) { if (anc == ANCHOR_END_BUF || anc == ANCHOR_SEMI_END_BUF || anc == ANCHOR_END_LINE || anc == ANCHOR_PREC_READ || anc == ANCHOR_PREC_READ_NOT) return 0; return 1; } static int is_set_opt_anc_info(OptAncInfo* to, int anc) { if ((to->left_anchor & anc) != 0) return 1; return ((to->right_anchor & anc) != 0 ? 1 : 0); } static void add_opt_anc_info(OptAncInfo* to, int anc) { if (is_left_anchor(anc)) to->left_anchor |= anc; else to->right_anchor |= anc; } static void remove_opt_anc_info(OptAncInfo* to, int anc) { if (is_left_anchor(anc)) to->left_anchor &= ~anc; else to->right_anchor &= ~anc; } static void alt_merge_opt_anc_info(OptAncInfo* to, OptAncInfo* add) { to->left_anchor &= add->left_anchor; to->right_anchor &= add->right_anchor; } static int is_full_opt_exact_info(OptExactInfo* ex) { return (ex->len >= OPT_EXACT_MAXLEN ? 1 : 0); } static void clear_opt_exact_info(OptExactInfo* ex) { clear_mml(&ex->mmd); clear_opt_anc_info(&ex->anc); ex->reach_end = 0; ex->ignore_case = -1; /* unset */ ex->len = 0; ex->s[0] = '\0'; } static void copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) { *to = *from; } static void concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc) { int i, j, len; UChar *p, *end; OptAncInfo tanc; if (to->ignore_case < 0) to->ignore_case = add->ignore_case; else if (to->ignore_case != add->ignore_case) return ; /* avoid */ p = add->s; end = p + add->len; for (i = to->len; p < end; ) { len = enclen(enc, p, end); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len && p < end; j++) to->s[i++] = *p++; } to->len = i; to->reach_end = (p == end ? add->reach_end : 0); concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); if (! to->reach_end) tanc.right_anchor = 0; copy_opt_anc_info(&to->anc, &tanc); } static void concat_opt_exact_info_str(OptExactInfo* to, UChar* s, UChar* end, int raw ARG_UNUSED, OnigEncoding enc) { int i, j, len; UChar *p; for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { len = enclen(enc, p, end); if (i + len > OPT_EXACT_MAXLEN) break; for (j = 0; j < len && p < end; j++) to->s[i++] = *p++; } to->len = i; } static void alt_merge_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OptEnv* env) { int i, j, len; if (add->len == 0 || to->len == 0) { clear_opt_exact_info(to); return ; } if (! is_equal_mml(&to->mmd, &add->mmd)) { clear_opt_exact_info(to); return ; } for (i = 0; i < to->len && i < add->len; ) { if (to->s[i] != add->s[i]) break; len = enclen(env->enc, to->s + i, to->s + to->len); for (j = 1; j < len; j++) { if (to->s[i+j] != add->s[i+j]) break; } if (j < len) break; i += len; } if (! add->reach_end || i < add->len || i < to->len) { to->reach_end = 0; } to->len = i; if (to->ignore_case < 0) to->ignore_case = add->ignore_case; else if (add->ignore_case >= 0) to->ignore_case |= add->ignore_case; alt_merge_opt_anc_info(&to->anc, &add->anc); if (! to->reach_end) to->anc.right_anchor = 0; } static void select_opt_exact_info(OnigEncoding enc, OptExactInfo* now, OptExactInfo* alt) { int v1, v2; v1 = now->len; v2 = alt->len; if (v2 == 0) { return ; } else if (v1 == 0) { copy_opt_exact_info(now, alt); return ; } else if (v1 <= 2 && v2 <= 2) { /* ByteValTable[x] is big value --> low price */ v2 = map_position_value(enc, now->s[0]); v1 = map_position_value(enc, alt->s[0]); if (now->len > 1) v1 += 5; if (alt->len > 1) v2 += 5; } if (now->ignore_case <= 0) v1 *= 2; if (alt->ignore_case <= 0) v2 *= 2; if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_exact_info(now, alt); } static void clear_opt_map_info(OptMapInfo* map) { static const OptMapInfo clean_info = { {0, 0}, {0, 0}, 0, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 } }; xmemcpy(map, &clean_info, sizeof(OptMapInfo)); } static void copy_opt_map_info(OptMapInfo* to, OptMapInfo* from) { *to = *from; } static void add_char_opt_map_info(OptMapInfo* map, UChar c, OnigEncoding enc) { if (map->map[c] == 0) { map->map[c] = 1; map->value += map_position_value(enc, c); } } static int add_char_amb_opt_map_info(OptMapInfo* map, UChar* p, UChar* end, OnigEncoding enc, OnigCaseFoldType case_fold_flag) { OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; UChar buf[ONIGENC_CODE_TO_MBC_MAXLEN]; int i, n; add_char_opt_map_info(map, p[0], enc); case_fold_flag = DISABLE_CASE_FOLD_MULTI_CHAR(case_fold_flag); n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, case_fold_flag, p, end, items); if (n < 0) return n; for (i = 0; i < n; i++) { ONIGENC_CODE_TO_MBC(enc, items[i].code[0], buf); add_char_opt_map_info(map, buf[0], enc); } return 0; } static void select_opt_map_info(OptMapInfo* now, OptMapInfo* alt) { const int z = 1<<15; /* 32768: something big value */ int v1, v2; if (alt->value == 0) return ; if (now->value == 0) { copy_opt_map_info(now, alt); return ; } v1 = z / now->value; v2 = z / alt->value; if (comp_distance_value(&now->mmd, &alt->mmd, v1, v2) > 0) copy_opt_map_info(now, alt); } static int comp_opt_exact_or_map_info(OptExactInfo* e, OptMapInfo* m) { #define COMP_EM_BASE 20 int ve, vm; if (m->value <= 0) return -1; ve = COMP_EM_BASE * e->len * (e->ignore_case > 0 ? 1 : 2); vm = COMP_EM_BASE * 5 * 2 / m->value; return comp_distance_value(&e->mmd, &m->mmd, ve, vm); } static void alt_merge_opt_map_info(OnigEncoding enc, OptMapInfo* to, OptMapInfo* add) { int i, val; /* if (! is_equal_mml(&to->mmd, &add->mmd)) return ; */ if (to->value == 0) return ; if (add->value == 0 || to->mmd.max < add->mmd.min) { clear_opt_map_info(to); return ; } alt_merge_mml(&to->mmd, &add->mmd); val = 0; for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { if (add->map[i]) to->map[i] = 1; if (to->map[i]) val += map_position_value(enc, i); } to->value = val; alt_merge_opt_anc_info(&to->anc, &add->anc); } static void set_bound_node_opt_info(NodeOptInfo* opt, MinMaxLen* mmd) { copy_mml(&(opt->exb.mmd), mmd); copy_mml(&(opt->expr.mmd), mmd); copy_mml(&(opt->map.mmd), mmd); } static void clear_node_opt_info(NodeOptInfo* opt) { clear_mml(&opt->len); clear_opt_anc_info(&opt->anc); clear_opt_exact_info(&opt->exb); clear_opt_exact_info(&opt->exm); clear_opt_exact_info(&opt->expr); clear_opt_map_info(&opt->map); } static void copy_node_opt_info(NodeOptInfo* to, NodeOptInfo* from) { *to = *from; } static void concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) { int exb_reach, exm_reach; OptAncInfo tanc; concat_opt_anc_info(&tanc, &to->anc, &add->anc, to->len.max, add->len.max); copy_opt_anc_info(&to->anc, &tanc); if (add->exb.len > 0 && to->len.max == 0) { concat_opt_anc_info(&tanc, &to->anc, &add->exb.anc, to->len.max, add->len.max); copy_opt_anc_info(&add->exb.anc, &tanc); } if (add->map.value > 0 && to->len.max == 0) { if (add->map.mmd.max == 0) add->map.anc.left_anchor |= to->anc.left_anchor; } exb_reach = to->exb.reach_end; exm_reach = to->exm.reach_end; if (add->len.max != 0) to->exb.reach_end = to->exm.reach_end = 0; if (add->exb.len > 0) { if (exb_reach) { concat_opt_exact_info(&to->exb, &add->exb, enc); clear_opt_exact_info(&add->exb); } else if (exm_reach) { concat_opt_exact_info(&to->exm, &add->exb, enc); clear_opt_exact_info(&add->exb); } } select_opt_exact_info(enc, &to->exm, &add->exb); select_opt_exact_info(enc, &to->exm, &add->exm); if (to->expr.len > 0) { if (add->len.max > 0) { if (to->expr.len > (int )add->len.max) to->expr.len = (int )add->len.max; if (to->expr.mmd.max == 0) select_opt_exact_info(enc, &to->exb, &to->expr); else select_opt_exact_info(enc, &to->exm, &to->expr); } } else if (add->expr.len > 0) { copy_opt_exact_info(&to->expr, &add->expr); } select_opt_map_info(&to->map, &add->map); add_mml(&to->len, &add->len); } static void alt_merge_node_opt_info(NodeOptInfo* to, NodeOptInfo* add, OptEnv* env) { alt_merge_opt_anc_info (&to->anc, &add->anc); alt_merge_opt_exact_info(&to->exb, &add->exb, env); alt_merge_opt_exact_info(&to->exm, &add->exm, env); alt_merge_opt_exact_info(&to->expr, &add->expr, env); alt_merge_opt_map_info(env->enc, &to->map, &add->map); alt_merge_mml(&to->len, &add->len); } #define MAX_NODE_OPT_INFO_REF_COUNT 5 static int optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) { int type; int r = 0; clear_node_opt_info(opt); set_bound_node_opt_info(opt, &env->mmd); type = NTYPE(node); switch (type) { case NT_LIST: { OptEnv nenv; NodeOptInfo nopt; Node* nd = node; copy_opt_env(&nenv, env); do { r = optimize_node_left(NCAR(nd), &nopt, &nenv); if (r == 0) { add_mml(&nenv.mmd, &nopt.len); concat_left_node_opt_info(env->enc, opt, &nopt); } } while (r == 0 && IS_NOT_NULL(nd = NCDR(nd))); } break; case NT_ALT: { NodeOptInfo nopt; Node* nd = node; do { r = optimize_node_left(NCAR(nd), &nopt, env); if (r == 0) { if (nd == node) copy_node_opt_info(opt, &nopt); else alt_merge_node_opt_info(opt, &nopt, env); } } while ((r == 0) && IS_NOT_NULL(nd = NCDR(nd))); } break; case NT_STR: { StrNode* sn = NSTR(node); OnigDistance slen = sn->end - sn->s; int is_raw = NSTRING_IS_RAW(node); if (! NSTRING_IS_AMBIG(node)) { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, is_raw, env->enc); opt->exb.ignore_case = 0; if (slen > 0) { add_char_opt_map_info(&opt->map, *(sn->s), env->enc); } set_mml(&opt->len, slen, slen); } else { OnigDistance max; if (NSTRING_IS_DONT_GET_OPT_INFO(node)) { int n = onigenc_strlen(env->enc, sn->s, sn->end); max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * n; } else { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, is_raw, env->enc); opt->exb.ignore_case = 1; if (slen > 0) { r = add_char_amb_opt_map_info(&opt->map, sn->s, sn->end, env->enc, env->case_fold_flag); if (r != 0) break; } max = slen; } set_mml(&opt->len, slen, max); } if ((OnigDistance )opt->exb.len == slen) opt->exb.reach_end = 1; } break; case NT_CCLASS: { int i, z; CClassNode* cc = NCCLASS(node); /* no need to check ignore case. (set in setup_tree()) */ if (IS_NOT_NULL(cc->mbuf) || IS_NCCLASS_NOT(cc)) { OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); set_mml(&opt->len, min, max); } else { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { z = BITSET_AT(cc->bs, i); if ((z && !IS_NCCLASS_NOT(cc)) || (!z && IS_NCCLASS_NOT(cc))) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } } set_mml(&opt->len, 1, 1); } } break; case NT_CTYPE: { int i, min, max; int maxcode; max = ONIGENC_MBC_MAXLEN_DIST(env->enc); if (max == 1) { min = 1; maxcode = NCTYPE(node)->ascii_range ? 0x80 : SINGLE_BYTE_SIZE; switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(node)->not != 0) { for (i = 0; i < SINGLE_BYTE_SIZE; i++) { if (! ONIGENC_IS_CODE_WORD(env->enc, i) || i >= maxcode) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } } } else { for (i = 0; i < maxcode; i++) { if (ONIGENC_IS_CODE_WORD(env->enc, i)) { add_char_opt_map_info(&opt->map, (UChar )i, env->enc); } } } break; } } else { min = ONIGENC_MBC_MINLEN(env->enc); } set_mml(&opt->len, min, max); } break; case NT_CANY: { OnigDistance min = ONIGENC_MBC_MINLEN(env->enc); OnigDistance max = ONIGENC_MBC_MAXLEN_DIST(env->enc); set_mml(&opt->len, min, max); } break; case NT_ANCHOR: switch (NANCHOR(node)->type) { case ANCHOR_BEGIN_BUF: case ANCHOR_BEGIN_POSITION: case ANCHOR_BEGIN_LINE: case ANCHOR_END_BUF: case ANCHOR_SEMI_END_BUF: case ANCHOR_END_LINE: case ANCHOR_LOOK_BEHIND: /* just for (?<=x).* */ case ANCHOR_PREC_READ_NOT: /* just for (?!x).* */ add_opt_anc_info(&opt->anc, NANCHOR(node)->type); break; case ANCHOR_PREC_READ: { NodeOptInfo nopt; r = optimize_node_left(NANCHOR(node)->target, &nopt, env); if (r == 0) { if (nopt.exb.len > 0) copy_opt_exact_info(&opt->expr, &nopt.exb); else if (nopt.exm.len > 0) copy_opt_exact_info(&opt->expr, &nopt.exm); opt->expr.reach_end = 0; if (nopt.map.value > 0) copy_opt_map_info(&opt->map, &nopt.map); } } break; case ANCHOR_LOOK_BEHIND_NOT: break; } break; case NT_BREF: { int i; int* backs; OnigDistance min, max, tmin, tmax; Node** nodes = SCANENV_MEM_NODES(env->scan_env); BRefNode* br = NBREF(node); if (br->state & NST_RECURSION) { set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); break; } backs = BACKREFS_P(br); r = get_min_match_length(nodes[backs[0]], &min, env->scan_env); if (r != 0) break; r = get_max_match_length(nodes[backs[0]], &max, env->scan_env); if (r != 0) break; for (i = 1; i < br->back_num; i++) { r = get_min_match_length(nodes[backs[i]], &tmin, env->scan_env); if (r != 0) break; r = get_max_match_length(nodes[backs[i]], &tmax, env->scan_env); if (r != 0) break; if (min > tmin) min = tmin; if (max < tmax) max = tmax; } if (r == 0) set_mml(&opt->len, min, max); } break; #ifdef USE_SUBEXP_CALL case NT_CALL: if (IS_CALL_RECURSION(NCALL(node))) set_mml(&opt->len, 0, ONIG_INFINITE_DISTANCE); else { OnigOptionType save = env->options; env->options = NENCLOSE(NCALL(node)->target)->option; r = optimize_node_left(NCALL(node)->target, opt, env); env->options = save; } break; #endif case NT_QTFR: { int i; OnigDistance min, max; NodeOptInfo nopt; QtfrNode* qn = NQTFR(node); r = optimize_node_left(qn->target, &nopt, env); if (r) break; if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) { if (env->mmd.max == 0 && NTYPE(qn->target) == NT_CANY && qn->greedy) { if (IS_MULTILINE(env->options)) /* implicit anchor: /.*a/ ==> /\A.*a/ */ add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_ML); else add_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR); } } else { if (qn->lower > 0) { copy_node_opt_info(opt, &nopt); if (nopt.exb.len > 0) { if (nopt.exb.reach_end) { for (i = 2; i <= qn->lower && ! is_full_opt_exact_info(&opt->exb); i++) { concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc); } if (i < qn->lower) { opt->exb.reach_end = 0; } } } if (qn->lower != qn->upper) { opt->exb.reach_end = 0; opt->exm.reach_end = 0; } if (qn->lower > 1) opt->exm.reach_end = 0; } } min = distance_multiply(nopt.len.min, qn->lower); if (IS_REPEAT_INFINITE(qn->upper)) max = (nopt.len.max > 0 ? ONIG_INFINITE_DISTANCE : 0); else max = distance_multiply(nopt.len.max, qn->upper); set_mml(&opt->len, min, max); } break; case NT_ENCLOSE: { EncloseNode* en = NENCLOSE(node); switch (en->type) { case ENCLOSE_OPTION: { OnigOptionType save = env->options; env->options = en->option; r = optimize_node_left(en->target, opt, env); env->options = save; } break; case ENCLOSE_MEMORY: #ifdef USE_SUBEXP_CALL en->opt_count++; if (en->opt_count > MAX_NODE_OPT_INFO_REF_COUNT) { OnigDistance min, max; min = 0; max = ONIG_INFINITE_DISTANCE; if (IS_ENCLOSE_MIN_FIXED(en)) min = en->min_len; if (IS_ENCLOSE_MAX_FIXED(en)) max = en->max_len; set_mml(&opt->len, min, max); } else #endif { r = optimize_node_left(en->target, opt, env); if (is_set_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK)) { if (BIT_STATUS_AT(env->scan_env->backrefed_mem, en->regnum)) remove_opt_anc_info(&opt->anc, ANCHOR_ANYCHAR_STAR_MASK); } } break; case ENCLOSE_STOP_BACKTRACK: case ENCLOSE_CONDITION: r = optimize_node_left(en->target, opt, env); break; } } break; default: #ifdef ONIG_DEBUG fprintf(stderr, "optimize_node_left: undefined node type %d\n", NTYPE(node)); #endif r = ONIGERR_TYPE_BUG; break; } return r; } static int set_optimize_exact_info(regex_t* reg, OptExactInfo* e) { int r; int allow_reverse; if (e->len == 0) return 0; reg->exact = (UChar* )xmalloc(e->len); CHECK_NULL_RETURN_MEMERR(reg->exact); xmemcpy(reg->exact, e->s, e->len); reg->exact_end = reg->exact + e->len; allow_reverse = ONIGENC_IS_ALLOWED_REVERSE_MATCH(reg->enc, reg->exact, reg->exact_end); if (e->ignore_case > 0) { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { r = set_bm_skip(reg->exact, reg->exact_end, reg, reg->map, &(reg->int_map), 1); if (r == 0) { reg->optimize = (allow_reverse != 0 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC); } else { reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } } else { reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } } else { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { r = set_bm_skip(reg->exact, reg->exact_end, reg, reg->map, &(reg->int_map), 0); if (r == 0) { reg->optimize = (allow_reverse != 0 ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); } else { reg->optimize = ONIG_OPTIMIZE_EXACT; } } else { reg->optimize = ONIG_OPTIMIZE_EXACT; } } reg->dmin = e->mmd.min; reg->dmax = e->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { reg->threshold_len = (int )(reg->dmin + (reg->exact_end - reg->exact)); } return 0; } static void set_optimize_map_info(regex_t* reg, OptMapInfo* m) { int i; for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) reg->map[i] = m->map[i]; reg->optimize = ONIG_OPTIMIZE_MAP; reg->dmin = m->mmd.min; reg->dmax = m->mmd.max; if (reg->dmin != ONIG_INFINITE_DISTANCE) { reg->threshold_len = (int )(reg->dmin + 1); } } static void set_sub_anchor(regex_t* reg, OptAncInfo* anc) { reg->sub_anchor |= anc->left_anchor & ANCHOR_BEGIN_LINE; reg->sub_anchor |= anc->right_anchor & ANCHOR_END_LINE; } #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) static void print_optimize_info(FILE* f, regex_t* reg); #endif static int set_optimize_info_from_tree(Node* node, regex_t* reg, ScanEnv* scan_env) { int r; NodeOptInfo opt; OptEnv env; env.enc = reg->enc; env.options = reg->options; env.case_fold_flag = reg->case_fold_flag; env.scan_env = scan_env; clear_mml(&env.mmd); r = optimize_node_left(node, &opt, &env); if (r) return r; reg->anchor = opt.anc.left_anchor & (ANCHOR_BEGIN_BUF | ANCHOR_BEGIN_POSITION | ANCHOR_ANYCHAR_STAR | ANCHOR_ANYCHAR_STAR_ML | ANCHOR_LOOK_BEHIND); if ((opt.anc.left_anchor & (ANCHOR_LOOK_BEHIND | ANCHOR_PREC_READ_NOT)) != 0) reg->anchor &= ~ANCHOR_ANYCHAR_STAR_ML; reg->anchor |= opt.anc.right_anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF | ANCHOR_PREC_READ_NOT); if (reg->anchor & (ANCHOR_END_BUF | ANCHOR_SEMI_END_BUF)) { reg->anchor_dmin = opt.len.min; reg->anchor_dmax = opt.len.max; } if (opt.exb.len > 0 || opt.exm.len > 0) { select_opt_exact_info(reg->enc, &opt.exb, &opt.exm); if (opt.map.value > 0 && comp_opt_exact_or_map_info(&opt.exb, &opt.map) > 0) { goto set_map; } else { r = set_optimize_exact_info(reg, &opt.exb); set_sub_anchor(reg, &opt.exb.anc); } } else if (opt.map.value > 0) { set_map: set_optimize_map_info(reg, &opt.map); set_sub_anchor(reg, &opt.map.anc); } else { reg->sub_anchor |= opt.anc.left_anchor & ANCHOR_BEGIN_LINE; if (opt.len.max == 0) reg->sub_anchor |= opt.anc.right_anchor & ANCHOR_END_LINE; } #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) print_optimize_info(stderr, reg); #endif return r; } static void clear_optimize_info(regex_t* reg) { reg->optimize = ONIG_OPTIMIZE_NONE; reg->anchor = 0; reg->anchor_dmin = 0; reg->anchor_dmax = 0; reg->sub_anchor = 0; reg->exact_end = (UChar* )NULL; reg->threshold_len = 0; if (IS_NOT_NULL(reg->exact)) { xfree(reg->exact); reg->exact = (UChar* )NULL; } } #ifdef ONIG_DEBUG static void print_enc_string(FILE* fp, OnigEncoding enc, const UChar *s, const UChar *end) { fprintf(fp, "\nPATTERN: /"); if (ONIGENC_MBC_MINLEN(enc) > 1) { const UChar *p; OnigCodePoint code; p = s; while (p < end) { code = ONIGENC_MBC_TO_CODE(enc, p, end); if (code >= 0x80) { fprintf(fp, " 0x%04x ", (int )code); } else { fputc((int )code, fp); } p += enclen(enc, p, end); } } else { while (s < end) { fputc((int )*s, fp); s++; } } fprintf(fp, "/ (%s)\n", enc->name); } #endif /* ONIG_DEBUG */ #if defined(ONIG_DEBUG_COMPILE) || defined(ONIG_DEBUG_MATCH) static void print_distance_range(FILE* f, OnigDistance a, OnigDistance b) { if (a == ONIG_INFINITE_DISTANCE) fputs("inf", f); else fprintf(f, "(%"PRIuPTR")", a); fputs("-", f); if (b == ONIG_INFINITE_DISTANCE) fputs("inf", f); else fprintf(f, "(%"PRIuPTR")", b); } static void print_anchor(FILE* f, int anchor) { int q = 0; fprintf(f, "["); if (anchor & ANCHOR_BEGIN_BUF) { fprintf(f, "begin-buf"); q = 1; } if (anchor & ANCHOR_BEGIN_LINE) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "begin-line"); } if (anchor & ANCHOR_BEGIN_POSITION) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "begin-pos"); } if (anchor & ANCHOR_END_BUF) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "end-buf"); } if (anchor & ANCHOR_SEMI_END_BUF) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "semi-end-buf"); } if (anchor & ANCHOR_END_LINE) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "end-line"); } if (anchor & ANCHOR_ANYCHAR_STAR) { if (q) fprintf(f, ", "); q = 1; fprintf(f, "anychar-star"); } if (anchor & ANCHOR_ANYCHAR_STAR_ML) { if (q) fprintf(f, ", "); fprintf(f, "anychar-star-ml"); } fprintf(f, "]"); } static void print_optimize_info(FILE* f, regex_t* reg) { static const char* on[] = { "NONE", "EXACT", "EXACT_BM", "EXACT_BM_NOT_REV", "EXACT_IC", "MAP", "EXACT_BM_IC", "EXACT_BM_NOT_REV_IC" }; fprintf(f, "optimize: %s\n", on[reg->optimize]); fprintf(f, " anchor: "); print_anchor(f, reg->anchor); if ((reg->anchor & ANCHOR_END_BUF_MASK) != 0) print_distance_range(f, reg->anchor_dmin, reg->anchor_dmax); fprintf(f, "\n"); if (reg->optimize) { fprintf(f, " sub anchor: "); print_anchor(f, reg->sub_anchor); fprintf(f, "\n"); } fprintf(f, "\n"); if (reg->exact) { UChar *p; fprintf(f, "exact: ["); for (p = reg->exact; p < reg->exact_end; p++) { fputc(*p, f); } fprintf(f, "]: length: %"PRIdPTR"\n", (reg->exact_end - reg->exact)); } else if (reg->optimize & ONIG_OPTIMIZE_MAP) { int c, i, n = 0; for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) if (reg->map[i]) n++; fprintf(f, "map: n=%d\n", n); if (n > 0) { c = 0; fputc('[', f); for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) { if (reg->map[i] != 0) { if (c > 0) fputs(", ", f); c++; if (ONIGENC_MBC_MAXLEN(reg->enc) == 1 && ONIGENC_IS_CODE_PRINT(reg->enc, (OnigCodePoint )i)) fputc(i, f); else fprintf(f, "%d", i); } } fprintf(f, "]\n"); } } } #endif /* ONIG_DEBUG_COMPILE || ONIG_DEBUG_MATCH */ extern void onig_free_body(regex_t* reg) { if (IS_NOT_NULL(reg)) { if (IS_NOT_NULL(reg->p)) xfree(reg->p); if (IS_NOT_NULL(reg->exact)) xfree(reg->exact); if (IS_NOT_NULL(reg->int_map)) xfree(reg->int_map); if (IS_NOT_NULL(reg->int_map_backward)) xfree(reg->int_map_backward); if (IS_NOT_NULL(reg->repeat_range)) xfree(reg->repeat_range); if (IS_NOT_NULL(reg->chain)) onig_free(reg->chain); #ifdef USE_NAMED_GROUP onig_names_free(reg); #endif } } extern void onig_free(regex_t* reg) { if (IS_NOT_NULL(reg)) { onig_free_body(reg); xfree(reg); } } #ifdef RUBY size_t onig_memsize(const regex_t *reg) { size_t size = sizeof(regex_t); if (IS_NULL(reg)) return 0; if (IS_NOT_NULL(reg->p)) size += reg->alloc; if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact; if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE; if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE; if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange); if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain); return size; } size_t onig_region_memsize(const OnigRegion *regs) { size_t size = sizeof(*regs); if (IS_NULL(regs)) return 0; size += regs->allocated * (sizeof(*regs->beg) + sizeof(*regs->end)); return size; } #endif #define REGEX_TRANSFER(to,from) do {\ onig_free_body(to);\ xmemcpy(to, from, sizeof(regex_t));\ xfree(from);\ } while (0) #if 0 extern void onig_transfer(regex_t* to, regex_t* from) { REGEX_TRANSFER(to, from); } #endif #ifdef ONIG_DEBUG_COMPILE static void print_compiled_byte_code_list(FILE* f, regex_t* reg); #endif #ifdef ONIG_DEBUG_PARSE_TREE static void print_tree(FILE* f, Node* node); #endif #ifdef RUBY extern int onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo) { return onig_compile_ruby(reg, pattern, pattern_end, einfo, NULL, 0); } #endif #ifdef RUBY extern int onig_compile_ruby(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo, const char *sourcefile, int sourceline) #else extern int onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigErrorInfo* einfo) #endif { #define COMPILE_INIT_SIZE 20 int r; OnigDistance init_size; Node* root; ScanEnv scan_env = {0}; #ifdef USE_SUBEXP_CALL UnsetAddrList uslist; #endif if (IS_NOT_NULL(einfo)) einfo->par = (UChar* )NULL; #ifdef RUBY scan_env.sourcefile = sourcefile; scan_env.sourceline = sourceline; #endif #ifdef ONIG_DEBUG print_enc_string(stderr, reg->enc, pattern, pattern_end); #endif if (reg->alloc == 0) { init_size = (pattern_end - pattern) * 2; if (init_size <= 0) init_size = COMPILE_INIT_SIZE; r = BBUF_INIT(reg, init_size); if (r != 0) goto end; } else reg->used = 0; reg->num_mem = 0; reg->num_repeat = 0; reg->num_null_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (OnigRepeatRange* )NULL; #ifdef USE_COMBINATION_EXPLOSION_CHECK reg->num_comb_exp_check = 0; #endif r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; #ifdef ONIG_DEBUG_PARSE_TREE # if 0 fprintf(stderr, "ORIGINAL PARSE TREE:\n"); print_tree(stderr, root); # endif #endif #ifdef USE_NAMED_GROUP /* mixed use named group and no-named group */ if (scan_env.num_named > 0 && IS_SYNTAX_BV(scan_env.syntax, ONIG_SYN_CAPTURE_ONLY_NAMED_GROUP) && !ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_CAPTURE_GROUP)) { if (scan_env.num_named != scan_env.num_mem) r = disable_noname_group_capture(&root, reg, &scan_env); else r = numbered_ref_check(root); if (r != 0) goto err; } #endif #ifdef USE_SUBEXP_CALL if (scan_env.num_call > 0) { r = unset_addr_list_init(&uslist, scan_env.num_call); if (r != 0) goto err; scan_env.unset_addr_list = &uslist; r = setup_subexp_call(root, &scan_env); if (r != 0) goto err_unset; r = subexp_recursive_check_trav(root, &scan_env); if (r < 0) goto err_unset; r = subexp_inf_recursive_check_trav(root, &scan_env); if (r != 0) goto err_unset; reg->num_call = scan_env.num_call; } else reg->num_call = 0; #endif r = setup_tree(root, reg, IN_ROOT, &scan_env); if (r != 0) goto err_unset; #ifdef ONIG_DEBUG_PARSE_TREE print_tree(stderr, root); #endif reg->capture_history = scan_env.capture_history; reg->bt_mem_start = scan_env.bt_mem_start; reg->bt_mem_start |= reg->capture_history; if (IS_FIND_CONDITION(reg->options)) BIT_STATUS_ON_ALL(reg->bt_mem_end); else { reg->bt_mem_end = scan_env.bt_mem_end; reg->bt_mem_end |= reg->capture_history; } #ifdef USE_COMBINATION_EXPLOSION_CHECK if (scan_env.backrefed_mem == 0 # ifdef USE_SUBEXP_CALL || scan_env.num_call == 0 # endif ) { setup_comb_exp_check(root, 0, &scan_env); # ifdef USE_SUBEXP_CALL if (scan_env.has_recursion != 0) { scan_env.num_comb_exp_check = 0; } else # endif if (scan_env.comb_exp_max_regnum > 0) { int i; for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) { if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) { scan_env.num_comb_exp_check = 0; break; } } } } reg->num_comb_exp_check = scan_env.num_comb_exp_check; #endif clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); if (r != 0) goto err_unset; #endif if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) { xfree(scan_env.mem_nodes_dynamic); scan_env.mem_nodes_dynamic = (Node** )NULL; } r = compile_tree(root, reg); if (r == 0) { r = add_opcode(reg, OP_END); #ifdef USE_SUBEXP_CALL if (scan_env.num_call > 0) { r = unset_addr_list_fix(&uslist, reg); unset_addr_list_end(&uslist); if (r) goto err; } #endif if ((reg->num_repeat != 0) || (reg->bt_mem_end != 0)) reg->stack_pop_level = STACK_POP_LEVEL_ALL; else { if (reg->bt_mem_start != 0) reg->stack_pop_level = STACK_POP_LEVEL_MEM_START; else reg->stack_pop_level = STACK_POP_LEVEL_FREE; } } #ifdef USE_SUBEXP_CALL else if (scan_env.num_call > 0) { unset_addr_list_end(&uslist); } #endif onig_node_free(root); #ifdef ONIG_DEBUG_COMPILE # ifdef USE_NAMED_GROUP onig_print_names(stderr, reg); # endif print_compiled_byte_code_list(stderr, reg); #endif end: return r; err_unset: #ifdef USE_SUBEXP_CALL if (scan_env.num_call > 0) { unset_addr_list_end(&uslist); } #endif err: if (IS_NOT_NULL(scan_env.error)) { if (IS_NOT_NULL(einfo)) { einfo->enc = scan_env.enc; einfo->par = scan_env.error; einfo->par_end = scan_env.error_end; } } onig_node_free(root); if (IS_NOT_NULL(scan_env.mem_nodes_dynamic)) xfree(scan_env.mem_nodes_dynamic); return r; } static int onig_inited = 0; extern int onig_reg_init(regex_t* reg, OnigOptionType option, OnigCaseFoldType case_fold_flag, OnigEncoding enc, const OnigSyntaxType* syntax) { if (! onig_inited) onig_init(); if (IS_NULL(reg)) return ONIGERR_INVALID_ARGUMENT; if (ONIGENC_IS_UNDEF(enc)) return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET; if ((option & (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) == (ONIG_OPTION_DONT_CAPTURE_GROUP|ONIG_OPTION_CAPTURE_GROUP)) { return ONIGERR_INVALID_COMBINATION_OF_OPTIONS; } if ((option & ONIG_OPTION_NEGATE_SINGLELINE) != 0) { option |= syntax->options; option &= ~ONIG_OPTION_SINGLELINE; } else option |= syntax->options; (reg)->enc = enc; (reg)->options = option; (reg)->syntax = syntax; (reg)->optimize = 0; (reg)->exact = (UChar* )NULL; (reg)->int_map = (int* )NULL; (reg)->int_map_backward = (int* )NULL; (reg)->chain = (regex_t* )NULL; (reg)->p = (UChar* )NULL; (reg)->alloc = 0; (reg)->used = 0; (reg)->name_table = (void* )NULL; (reg)->case_fold_flag = case_fold_flag; return 0; } extern int onig_new_without_alloc(regex_t* reg, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, OnigSyntaxType* syntax, OnigErrorInfo* einfo) { int r; r = onig_reg_init(reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) return r; r = onig_compile(reg, pattern, pattern_end, einfo); return r; } extern int onig_new(regex_t** reg, const UChar* pattern, const UChar* pattern_end, OnigOptionType option, OnigEncoding enc, const OnigSyntaxType* syntax, OnigErrorInfo* einfo) { int r; *reg = (regex_t* )xmalloc(sizeof(regex_t)); if (IS_NULL(*reg)) return ONIGERR_MEMORY; r = onig_reg_init(*reg, option, ONIGENC_CASE_FOLD_DEFAULT, enc, syntax); if (r) goto err; r = onig_compile(*reg, pattern, pattern_end, einfo); if (r) { err: onig_free(*reg); *reg = NULL; } return r; } extern int onig_initialize(OnigEncoding encodings[] ARG_UNUSED, int n ARG_UNUSED) { return onig_init(); } extern int onig_init(void) { if (onig_inited != 0) return 0; onig_inited = 1; #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF); #endif onigenc_init(); /* onigenc_set_default_caseconv_table((UChar* )0); */ #ifdef ONIG_DEBUG_STATISTICS onig_statistics_init(); #endif return 0; } static OnigEndCallListItemType* EndCallTop; extern void onig_add_end_call(void (*func)(void)) { OnigEndCallListItemType* item; item = (OnigEndCallListItemType* )xmalloc(sizeof(*item)); if (item == 0) return ; item->next = EndCallTop; item->func = func; EndCallTop = item; } static void exec_end_call_list(void) { OnigEndCallListItemType* prev; void (*func)(void); while (EndCallTop != 0) { func = EndCallTop->func; (*func)(); prev = EndCallTop; EndCallTop = EndCallTop->next; xfree(prev); } } extern int onig_end(void) { exec_end_call_list(); #ifdef ONIG_DEBUG_STATISTICS onig_print_statistics(stderr); #endif #if defined(ONIG_DEBUG_MEMLEAK) && defined(_MSC_VER) _CrtDumpMemoryLeaks(); #endif onig_inited = 0; return 0; } extern int onig_is_in_code_range(const UChar* p, OnigCodePoint code) { OnigCodePoint n, *data; OnigCodePoint low, high, x; GET_CODE_POINT(n, p); data = (OnigCodePoint* )p; data++; for (low = 0, high = n; low < high; ) { x = (low + high) >> 1; if (code > data[x * 2 + 1]) low = x + 1; else high = x; } return ((low < n && code >= data[low * 2]) ? 1 : 0); } extern int onig_is_code_in_cc_len(int elen, OnigCodePoint code, CClassNode* cc) { int found; if (elen > 1 || (code >= SINGLE_BYTE_SIZE)) { if (IS_NULL(cc->mbuf)) { found = 0; } else { found = (onig_is_in_code_range(cc->mbuf->p, code) != 0 ? 1 : 0); } } else { found = (BITSET_AT(cc->bs, code) == 0 ? 0 : 1); } if (IS_NCCLASS_NOT(cc)) return !found; else return found; } extern int onig_is_code_in_cc(OnigEncoding enc, OnigCodePoint code, CClassNode* cc) { int len; if (ONIGENC_MBC_MINLEN(enc) > 1) { len = 2; } else { len = ONIGENC_CODE_TO_MBCLEN(enc, code); } return onig_is_code_in_cc_len(len, code, cc); } #ifdef ONIG_DEBUG /* arguments type */ # define ARG_SPECIAL -1 # define ARG_NON 0 # define ARG_RELADDR 1 # define ARG_ABSADDR 2 # define ARG_LENGTH 3 # define ARG_MEMNUM 4 # define ARG_OPTION 5 # define ARG_STATE_CHECK 6 OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, { OP_END, "end", ARG_NON }, { OP_EXACT1, "exact1", ARG_SPECIAL }, { OP_EXACT2, "exact2", ARG_SPECIAL }, { OP_EXACT3, "exact3", ARG_SPECIAL }, { OP_EXACT4, "exact4", ARG_SPECIAL }, { OP_EXACT5, "exact5", ARG_SPECIAL }, { OP_EXACTN, "exactn", ARG_SPECIAL }, { OP_EXACTMB2N1, "exactmb2-n1", ARG_SPECIAL }, { OP_EXACTMB2N2, "exactmb2-n2", ARG_SPECIAL }, { OP_EXACTMB2N3, "exactmb2-n3", ARG_SPECIAL }, { OP_EXACTMB2N, "exactmb2-n", ARG_SPECIAL }, { OP_EXACTMB3N, "exactmb3n" , ARG_SPECIAL }, { OP_EXACTMBN, "exactmbn", ARG_SPECIAL }, { OP_EXACT1_IC, "exact1-ic", ARG_SPECIAL }, { OP_EXACTN_IC, "exactn-ic", ARG_SPECIAL }, { OP_CCLASS, "cclass", ARG_SPECIAL }, { OP_CCLASS_MB, "cclass-mb", ARG_SPECIAL }, { OP_CCLASS_MIX, "cclass-mix", ARG_SPECIAL }, { OP_CCLASS_NOT, "cclass-not", ARG_SPECIAL }, { OP_CCLASS_MB_NOT, "cclass-mb-not", ARG_SPECIAL }, { OP_CCLASS_MIX_NOT, "cclass-mix-not", ARG_SPECIAL }, { OP_ANYCHAR, "anychar", ARG_NON }, { OP_ANYCHAR_ML, "anychar-ml", ARG_NON }, { OP_ANYCHAR_STAR, "anychar*", ARG_NON }, { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON }, { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, { OP_WORD, "word", ARG_NON }, { OP_NOT_WORD, "not-word", ARG_NON }, { OP_WORD_BOUND, "word-bound", ARG_NON }, { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, { OP_WORD_BEGIN, "word-begin", ARG_NON }, { OP_WORD_END, "word-end", ARG_NON }, { OP_ASCII_WORD, "ascii-word", ARG_NON }, { OP_NOT_ASCII_WORD, "not-ascii-word", ARG_NON }, { OP_ASCII_WORD_BOUND, "ascii-word-bound", ARG_NON }, { OP_NOT_ASCII_WORD_BOUND,"not-ascii-word-bound", ARG_NON }, { OP_ASCII_WORD_BEGIN, "ascii-word-begin", ARG_NON }, { OP_ASCII_WORD_END, "ascii-word-end", ARG_NON }, { OP_BEGIN_BUF, "begin-buf", ARG_NON }, { OP_END_BUF, "end-buf", ARG_NON }, { OP_BEGIN_LINE, "begin-line", ARG_NON }, { OP_END_LINE, "end-line", ARG_NON }, { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, { OP_BEGIN_POSITION, "begin-position", ARG_NON }, { OP_BEGIN_POS_OR_LINE, "begin-pos-or-line", ARG_NON }, { OP_BACKREF1, "backref1", ARG_NON }, { OP_BACKREF2, "backref2", ARG_NON }, { OP_BACKREFN, "backrefn", ARG_MEMNUM }, { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, { OP_BACKREF_WITH_LEVEL, "backref_at_level", ARG_SPECIAL }, { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, { OP_SET_OPTION, "set-option", ARG_OPTION }, { OP_KEEP, "keep", ARG_NON }, { OP_FAIL, "fail", ARG_NON }, { OP_JUMP, "jump", ARG_RELADDR }, { OP_PUSH, "push", ARG_RELADDR }, { OP_POP, "pop", ARG_NON }, { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, { OP_REPEAT, "repeat", ARG_SPECIAL }, { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM }, { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, { OP_PUSH_POS, "push-pos", ARG_NON }, { OP_POP_POS, "pop-pos", ARG_NON }, { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, { OP_FAIL_POS, "fail-pos", ARG_NON }, { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, { OP_CALL, "call", ARG_ABSADDR }, { OP_RETURN, "return", ARG_NON }, { OP_CONDITION, "condition", ARG_SPECIAL }, { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL }, { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL }, { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK }, { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK }, { OP_STATE_CHECK_ANYCHAR_ML_STAR, "state-check-anychar-ml*", ARG_STATE_CHECK }, { -1, "", ARG_NON } }; static const char* op2name(int opcode) { int i; for (i = 0; OnigOpInfo[i].opcode >= 0; i++) { if (opcode == OnigOpInfo[i].opcode) return OnigOpInfo[i].name; } return ""; } static int op2arg_type(int opcode) { int i; for (i = 0; OnigOpInfo[i].opcode >= 0; i++) { if (opcode == OnigOpInfo[i].opcode) return OnigOpInfo[i].arg_type; } return ARG_SPECIAL; } # ifdef ONIG_DEBUG_PARSE_TREE static void Indent(FILE* f, int indent) { int i; for (i = 0; i < indent; i++) putc(' ', f); } # endif /* ONIG_DEBUG_PARSE_TREE */ static void p_string(FILE* f, ptrdiff_t len, UChar* s) { fputs(":", f); while (len-- > 0) { fputc(*s++, f); } } static void p_len_string(FILE* f, LengthType len, int mb_len, UChar* s) { int x = len * mb_len; fprintf(f, ":%d:", len); while (x-- > 0) { fputc(*s++, f); } } extern void onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar* bpend, UChar** nextp, OnigEncoding enc) { int i, n, arg_type; RelAddrType addr; LengthType len; MemNumType mem; StateCheckNumType scn; OnigCodePoint code; UChar *q; fprintf(f, "[%s", op2name(*bp)); arg_type = op2arg_type(*bp); if (arg_type != ARG_SPECIAL) { bp++; switch (arg_type) { case ARG_NON: break; case ARG_RELADDR: GET_RELADDR_INC(addr, bp); fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr); break; case ARG_ABSADDR: GET_ABSADDR_INC(addr, bp); fprintf(f, ":(%d)", addr); break; case ARG_LENGTH: GET_LENGTH_INC(len, bp); fprintf(f, ":%d", len); break; case ARG_MEMNUM: mem = *((MemNumType* )bp); bp += SIZE_MEMNUM; fprintf(f, ":%d", mem); break; case ARG_OPTION: { OnigOptionType option = *((OnigOptionType* )bp); bp += SIZE_OPTION; fprintf(f, ":%d", option); } break; case ARG_STATE_CHECK: scn = *((StateCheckNumType* )bp); bp += SIZE_STATE_CHECK_NUM; fprintf(f, ":%d", scn); break; } } else { switch (*bp++) { case OP_EXACT1: case OP_ANYCHAR_STAR_PEEK_NEXT: case OP_ANYCHAR_ML_STAR_PEEK_NEXT: p_string(f, 1, bp++); break; case OP_EXACT2: p_string(f, 2, bp); bp += 2; break; case OP_EXACT3: p_string(f, 3, bp); bp += 3; break; case OP_EXACT4: p_string(f, 4, bp); bp += 4; break; case OP_EXACT5: p_string(f, 5, bp); bp += 5; break; case OP_EXACTN: GET_LENGTH_INC(len, bp); p_len_string(f, len, 1, bp); bp += len; break; case OP_EXACTMB2N1: p_string(f, 2, bp); bp += 2; break; case OP_EXACTMB2N2: p_string(f, 4, bp); bp += 4; break; case OP_EXACTMB2N3: p_string(f, 6, bp); bp += 6; break; case OP_EXACTMB2N: GET_LENGTH_INC(len, bp); p_len_string(f, len, 2, bp); bp += len * 2; break; case OP_EXACTMB3N: GET_LENGTH_INC(len, bp); p_len_string(f, len, 3, bp); bp += len * 3; break; case OP_EXACTMBN: { int mb_len; GET_LENGTH_INC(mb_len, bp); GET_LENGTH_INC(len, bp); fprintf(f, ":%d:%d:", mb_len, len); n = len * mb_len; while (n-- > 0) { fputc(*bp++, f); } } break; case OP_EXACT1_IC: len = enclen(enc, bp, bpend); p_string(f, len, bp); bp += len; break; case OP_EXACTN_IC: GET_LENGTH_INC(len, bp); p_len_string(f, len, 1, bp); bp += len; break; case OP_CCLASS: n = bitset_on_num((BitSetRef )bp); bp += SIZE_BITSET; fprintf(f, ":%d", n); break; case OP_CCLASS_NOT: n = bitset_on_num((BitSetRef )bp); bp += SIZE_BITSET; fprintf(f, ":%d", n); break; case OP_CCLASS_MB: case OP_CCLASS_MB_NOT: GET_LENGTH_INC(len, bp); q = bp; # ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); # endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d", (int )code, len); break; case OP_CCLASS_MIX: case OP_CCLASS_MIX_NOT: n = bitset_on_num((BitSetRef )bp); bp += SIZE_BITSET; GET_LENGTH_INC(len, bp); q = bp; # ifndef PLATFORM_UNALIGNED_WORD_ACCESS ALIGNMENT_RIGHT(q); # endif GET_CODE_POINT(code, q); bp += len; fprintf(f, ":%d:%d:%d", n, (int )code, len); break; case OP_BACKREFN_IC: mem = *((MemNumType* )bp); bp += SIZE_MEMNUM; fprintf(f, ":%d", mem); break; case OP_BACKREF_MULTI_IC: case OP_BACKREF_MULTI: fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { GET_MEMNUM_INC(mem, bp); if (i > 0) fputs(", ", f); fprintf(f, "%d", mem); } break; case OP_BACKREF_WITH_LEVEL: { OnigOptionType option; LengthType level; GET_OPTION_INC(option, bp); fprintf(f, ":%d", option); GET_LENGTH_INC(level, bp); fprintf(f, ":%d", level); fputs(" ", f); GET_LENGTH_INC(len, bp); for (i = 0; i < len; i++) { GET_MEMNUM_INC(mem, bp); if (i > 0) fputs(", ", f); fprintf(f, "%d", mem); } } break; case OP_REPEAT: case OP_REPEAT_NG: { mem = *((MemNumType* )bp); bp += SIZE_MEMNUM; addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; fprintf(f, ":%d:%d", mem, addr); } break; case OP_PUSH_OR_JUMP_EXACT1: case OP_PUSH_IF_PEEK_NEXT: addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; fprintf(f, ":(%s%d)", (addr >= 0) ? "+" : "", addr); p_string(f, 1, bp); bp += 1; break; case OP_LOOK_BEHIND: GET_LENGTH_INC(len, bp); fprintf(f, ":%d", len); break; case OP_PUSH_LOOK_BEHIND_NOT: GET_RELADDR_INC(addr, bp); GET_LENGTH_INC(len, bp); fprintf(f, ":%d:(%s%d)", len, (addr >= 0) ? "+" : "", addr); break; case OP_STATE_CHECK_PUSH: case OP_STATE_CHECK_PUSH_OR_JUMP: scn = *((StateCheckNumType* )bp); bp += SIZE_STATE_CHECK_NUM; addr = *((RelAddrType* )bp); bp += SIZE_RELADDR; fprintf(f, ":%d:(%s%d)", scn, (addr >= 0) ? "+" : "", addr); break; case OP_CONDITION: GET_MEMNUM_INC(mem, bp); GET_RELADDR_INC(addr, bp); fprintf(f, ":%d:(%s%d)", mem, (addr >= 0) ? "+" : "", addr); break; default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", *--bp); } } fputs("]", f); if (nextp) *nextp = bp; } # ifdef ONIG_DEBUG_COMPILE static void print_compiled_byte_code_list(FILE* f, regex_t* reg) { int ncode; UChar* bp = reg->p; UChar* end = reg->p + reg->used; fprintf(f, "code length: %d", reg->used); ncode = -1; while (bp < end) { ncode++; if (ncode % 5 == 0) fprintf(f, "\n%ld:", bp - reg->p); else fprintf(f, " %ld:", bp - reg->p); onig_print_compiled_byte_code(f, bp, end, &bp, reg->enc); } fprintf(f, "\n"); } # endif /* ONIG_DEBUG_COMPILE */ # ifdef ONIG_DEBUG_PARSE_TREE void print_indent_tree(FILE* f, Node* node, int indent) { int i, type, container_p = 0; int add = 3; UChar* p; Indent(f, indent); if (IS_NULL(node)) { fprintf(f, "ERROR: null node!!!\n"); exit (0); } type = NTYPE(node); switch (type) { case NT_LIST: case NT_ALT: if (NTYPE(node) == NT_LIST) fprintf(f, "\n", (intptr_t )node); else fprintf(f, "\n", (intptr_t )node); print_indent_tree(f, NCAR(node), indent + add); while (IS_NOT_NULL(node = NCDR(node))) { if (NTYPE(node) != type) { fprintf(f, "ERROR: list/alt right is not a cons. %d\n", NTYPE(node)); exit(0); } print_indent_tree(f, NCAR(node), indent + add); } break; case NT_STR: fprintf(f, "", (NSTRING_IS_RAW(node) ? "-raw" : ""), (intptr_t )node); for (p = NSTR(node)->s; p < NSTR(node)->end; p++) { if (*p >= 0x20 && *p < 0x7f) fputc(*p, f); else { fprintf(f, " 0x%02x", *p); } } break; case NT_CCLASS: fprintf(f, "", (intptr_t )node); if (IS_NCCLASS_NOT(NCCLASS(node))) fputs("not ", f); if (NCCLASS(node)->mbuf) { BBuf* bbuf = NCCLASS(node)->mbuf; OnigCodePoint* data = (OnigCodePoint* )bbuf->p; OnigCodePoint* end = (OnigCodePoint* )(bbuf->p + bbuf->used); fprintf(f, "%d", *data++); for (; data < end; data+=2) { fprintf(f, ","); fprintf(f, "%04x-%04x", data[0], data[1]); } } break; case NT_CTYPE: fprintf(f, " ", (intptr_t )node); switch (NCTYPE(node)->ctype) { case ONIGENC_CTYPE_WORD: if (NCTYPE(node)->not != 0) fputs("not word", f); else fputs("word", f); break; default: fprintf(f, "ERROR: undefined ctype.\n"); exit(0); } break; case NT_CANY: fprintf(f, "", (intptr_t )node); break; case NT_ANCHOR: fprintf(f, " ", (intptr_t )node); switch (NANCHOR(node)->type) { case ANCHOR_BEGIN_BUF: fputs("begin buf", f); break; case ANCHOR_END_BUF: fputs("end buf", f); break; case ANCHOR_BEGIN_LINE: fputs("begin line", f); break; case ANCHOR_END_LINE: fputs("end line", f); break; case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break; case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break; case ANCHOR_ANYCHAR_STAR: fputs("begin position/line", f); break; case ANCHOR_WORD_BOUND: fputs("word bound", f); break; case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break; # ifdef USE_WORD_BEGIN_END case ANCHOR_WORD_BEGIN: fputs("word begin", f); break; case ANCHOR_WORD_END: fputs("word end", f); break; # endif case ANCHOR_PREC_READ: fputs("prec read", f); container_p = TRUE; break; case ANCHOR_PREC_READ_NOT: fputs("prec read not", f); container_p = TRUE; break; case ANCHOR_LOOK_BEHIND: fputs("look_behind", f); container_p = TRUE; break; case ANCHOR_LOOK_BEHIND_NOT: fputs("look_behind_not",f); container_p = TRUE; break; case ANCHOR_KEEP: fputs("keep",f); break; default: fprintf(f, "ERROR: undefined anchor type.\n"); break; } break; case NT_BREF: { int* p; BRefNode* br = NBREF(node); p = BACKREFS_P(br); fprintf(f, "", (intptr_t )node); for (i = 0; i < br->back_num; i++) { if (i > 0) fputs(", ", f); fprintf(f, "%d", p[i]); } } break; # ifdef USE_SUBEXP_CALL case NT_CALL: { CallNode* cn = NCALL(node); fprintf(f, "", (intptr_t )node); p_string(f, cn->name_end - cn->name, cn->name); } break; # endif case NT_QTFR: fprintf(f, "{%d,%d}%s\n", (intptr_t )node, NQTFR(node)->lower, NQTFR(node)->upper, (NQTFR(node)->greedy ? "" : "?")); print_indent_tree(f, NQTFR(node)->target, indent + add); break; case NT_ENCLOSE: fprintf(f, " ", (intptr_t )node); switch (NENCLOSE(node)->type) { case ENCLOSE_OPTION: fprintf(f, "option:%d", NENCLOSE(node)->option); break; case ENCLOSE_MEMORY: fprintf(f, "memory:%d", NENCLOSE(node)->regnum); break; case ENCLOSE_STOP_BACKTRACK: fprintf(f, "stop-bt"); break; case ENCLOSE_CONDITION: fprintf(f, "condition:%d", NENCLOSE(node)->regnum); break; default: break; } fprintf(f, "\n"); print_indent_tree(f, NENCLOSE(node)->target, indent + add); break; default: fprintf(f, "print_indent_tree: undefined node type %d\n", NTYPE(node)); break; } if (type != NT_LIST && type != NT_ALT && type != NT_QTFR && type != NT_ENCLOSE) fprintf(f, "\n"); if (container_p) print_indent_tree(f, NANCHOR(node)->target, indent + add); fflush(f); } static void print_tree(FILE* f, Node* node) { print_indent_tree(f, node, 0); } # endif /* ONIG_DEBUG_PARSE_TREE */ #endif /* ONIG_DEBUG */