diff options
Diffstat (limited to 'regcomp.c')
| -rw-r--r-- | regcomp.c | 285 |
1 files changed, 91 insertions, 194 deletions
@@ -2,8 +2,8 @@ regcomp.c - Onigmo (Oniguruma-mod) (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> - * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp> + * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp> + * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -394,6 +394,7 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info) r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */ if (r) return r; reg->num_null_check++; + if ((MemNumType)reg->num_null_check <= 0) return ONIGERR_TOO_MANY_NULL_CHECK; } r = compile_tree(node, reg); @@ -703,6 +704,7 @@ compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info, if (r) return r; r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */ reg->num_repeat++; + if ((MemNumType)reg->num_repeat <= 0) return ONIGERR_TOO_MANY_RANGE_REPEAT; if (r) return r; r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC); if (r) return r; @@ -2803,14 +2805,11 @@ get_head_value_node(Node* node, int exact, regex_t* reg) case NT_STR: { StrNode* sn = NSTR(node); - if (sn->end <= sn->s) break; - if (exact != 0 && - !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) { - } - else { + if (exact == 0 || + NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) { n = node; } } @@ -3301,6 +3300,14 @@ setup_subexp_call(Node* node, ScanEnv* env) } #endif +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) +#define IN_CALL (1<<4) +#define IN_RECCALL (1<<5) +#define IN_LOOK_BEHIND (1<<6) + /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) (?<!A|B) ==> (?<!A)(?<!B) @@ -3597,24 +3604,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], return ONIGERR_MEMORY; } -static int -expand_case_fold_string(Node* node, regex_t* reg) -{ #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 +static int +expand_case_fold_string(Node* node, regex_t* reg, int state) +{ int r, n, len, alt_num; int varlen = 0; + int is_in_look_behind; UChar *start, *end, *p; Node *top_root, *root, *snode, *prev_node; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - StrNode* sn = NSTR(node); + StrNode* sn; if (NSTRING_IS_AMBIG(node)) return 0; + sn = NSTR(node); + start = sn->s; end = sn->end; if (start >= end) return 0; + is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + r = 0; top_root = root = prev_node = snode = NULL_NODE; alt_num = 1; @@ -3630,7 +3642,7 @@ expand_case_fold_string(Node* node, regex_t* reg) len = enclen(reg->enc, p, end); varlen = is_case_fold_variable_len(n, items, len); - if (n == 0 || varlen == 0) { + if (n == 0 || varlen == 0 || is_in_look_behind) { if (IS_NULL(snode)) { if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { onig_node_free(top_root); @@ -3889,13 +3901,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) } #endif -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) -#define IN_VAR_REPEAT (1<<3) -#define IN_CALL (1<<4) -#define IN_RECCALL (1<<5) - /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) 2. expand ignore-case in char class. @@ -3937,7 +3942,7 @@ restart: case NT_STR: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg); + r = expand_case_fold_string(node, reg, state); } break; @@ -4180,7 +4185,7 @@ restart: if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, state, env); + r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } @@ -4193,7 +4198,8 @@ restart: if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, (state | IN_NOT), env); + r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND), + env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } @@ -4209,169 +4215,73 @@ restart: return r; } -#ifndef USE_SUNDAY_QUICK_SEARCH -/* set skip map for Boyer-Moore search */ +/* set skip map for Sunday's quick search */ static int set_bm_skip(UChar* s, UChar* end, regex_t* reg, - UChar skip[], int** int_skip, int ignore_case) + UChar skip[], int ignore_case) { OnigDistance i, len; int clen, flen, n, j, k; - UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; + UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN]; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; OnigEncoding enc = reg->enc; len = end - s; - if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len; - - n = 0; - for (i = 0; i < len - 1; i += clen) { - p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); - clen = enclen(enc, p, end); - if (p + clen > end) - clen = (int )(end - p); - - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - skip[s[i + j]] = (UChar )(len - 1 - i - j); - for (k = 0; k < n; k++) { - skip[buf[k][j]] = (UChar )(len - 1 - i - j); - } - } - } - } - else { -# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE + if (len >= ONIG_CHAR_TABLE_SIZE) { /* This should not happen. */ return ONIGERR_TYPE_BUG; -# else - if (IS_NULL(*int_skip)) { - *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); - if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; - } - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len; - - n = 0; - for (i = 0; i < len - 1; i += clen) { - p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); - clen = enclen(enc, p, end); - if (p + clen > end) - clen = (int )(end - p); - - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - (*int_skip)[s[i + j]] = (int )(len - 1 - i - j); - for (k = 0; k < n; k++) { - (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j); - } - } - } -# endif } - return 0; -} - -#else /* USE_SUNDAY_QUICK_SEARCH */ - -/* set skip map for Sunday's quick search */ -static int -set_bm_skip(UChar* s, UChar* end, regex_t* reg, - UChar skip[], int** int_skip, int ignore_case) -{ - OnigDistance i, len; - int clen, flen, n, j, k; - UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN]; - OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - OnigEncoding enc = reg->enc; - len = end - s; - if (len < ONIG_CHAR_TABLE_SIZE) { - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1); - - n = 0; + if (ignore_case) { for (i = 0; i < len; i += clen) { p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); clen = enclen(enc, p, end); if (p + clen > end) clen = (int )(end - p); for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - skip[s[i + j]] = (UChar )(len - i - j); - for (k = 0; k < n; k++) { - skip[buf[k][j]] = (UChar )(len - i - j); - } + if ((items[j].code_len != 1) || (items[j].byte_len != clen)) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } + flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf); + if (flen != clen) { + /* Different length isn't supported. Stop optimization at here. */ + end = p; + goto endcheck; + } } } +endcheck: + len = end - s; } - else { -# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE - /* This should not happen. */ - return ONIGERR_TYPE_BUG; -# else - if (IS_NULL(*int_skip)) { - *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE); - if (IS_NULL(*int_skip)) return ONIGERR_MEMORY; - } - for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1); - - n = 0; - for (i = 0; i < len; i += clen) { - p = s + i; - if (ignore_case) - n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, - p, end, items); - clen = enclen(enc, p, end); - if (p + clen > end) - clen = (int )(end - p); - for (j = 0; j < n; j++) { - if ((items[j].code_len != 1) || (items[j].byte_len != clen)) - return 1; /* different length isn't supported. */ - flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]); - if (flen != clen) - return 1; /* different length isn't supported. */ - } - for (j = 0; j < clen; j++) { - (*int_skip)[s[i + j]] = (int )(len - i - j); - for (k = 0; k < n; k++) { - (*int_skip)[buf[k][j]] = (int )(len - i - j); - } + for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) + skip[i] = (UChar )(len + 1); + n = 0; + for (i = 0; i < len; i += clen) { + p = s + i; + if (ignore_case) + n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag, + p, end, items); + clen = enclen(enc, p, end); + if (p + clen > end) + clen = (int )(end - p); + + for (j = 0; j < clen; j++) { + skip[s[i + j]] = (UChar )(len - i - j); + for (k = 0; k < n; k++) { + ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf); + skip[buf[j]] = (UChar )(len - i - j); } } -# endif } - return 0; + + return (int )len; } -#endif /* USE_SUNDAY_QUICK_SEARCH */ typedef struct { OnigDistance min; /* min byte length */ @@ -5049,7 +4959,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (NSTRING_IS_DONT_GET_OPT_INFO(node)) { int n = onigenc_strlen(env->enc, sn->s, sn->end); - max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n; + max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n; } else { concat_opt_exact_info_str(&opt->exb, sn->s, sn->end, @@ -5232,7 +5142,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) r = optimize_node_left(qn->target, &nopt, env); if (r) break; - if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) { + if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) { if (env->mmd.max == 0 && NTYPE(qn->target) == NT_CANY && qn->greedy) { if (IS_MULTILINE(env->options)) @@ -5342,7 +5252,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) static int set_optimize_exact_info(regex_t* reg, OptExactInfo* e) { - int r; int allow_reverse; if (e->len == 0) return 0; @@ -5357,13 +5266,22 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) if (e->ignore_case > 0) { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg, - reg->map, &(reg->int_map), 1); - if (r == 0) { + int orig_len = e->len; + e->len = set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, 1); + if (e->len >= 3) { + reg->exact_end = reg->exact + e->len; reg->optimize = (allow_reverse != 0 ? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC); } else { + /* Even if BM skip table can't be built (e.g., pattern starts with + 's' or 'k' which have multi-byte case fold variants), we should + still use EXACT_IC optimization with the original pattern. + Without this fallback, patterns like /slackware/i have no + optimization at all, causing severe performance regression + especially with non-ASCII strings. See [Bug #21824] */ + e->len = orig_len; /* Restore original length for EXACT_IC */ reg->optimize = ONIG_OPTIMIZE_EXACT_IC; } } @@ -5373,15 +5291,10 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e) } else { if (e->len >= 3 || (e->len >= 2 && allow_reverse)) { - r = set_bm_skip(reg->exact, reg->exact_end, reg, - reg->map, &(reg->int_map), 0); - if (r == 0) { - reg->optimize = (allow_reverse != 0 - ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); - } - else { - reg->optimize = ONIG_OPTIMIZE_EXACT; - } + set_bm_skip(reg->exact, reg->exact_end, reg, + reg->map, 0); + reg->optimize = (allow_reverse != 0 + ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV); } else { reg->optimize = ONIG_OPTIMIZE_EXACT; @@ -5662,8 +5575,6 @@ onig_free_body(regex_t* reg) if (IS_NOT_NULL(reg)) { xfree(reg->p); xfree(reg->exact); - xfree(reg->int_map); - xfree(reg->int_map_backward); xfree(reg->repeat_range); onig_free(reg->chain); @@ -5710,14 +5621,6 @@ onig_reg_copy(regex_t** nreg, regex_t* oreg) (reg)->exact_end = (reg)->exact + exact_size; } - if (IS_NOT_NULL(reg->int_map)) { - if (COPY_FAILED(int_map, sizeof(int) * ONIG_CHAR_TABLE_SIZE)) - goto err_int_map; - } - if (IS_NOT_NULL(reg->int_map_backward)) { - if (COPY_FAILED(int_map_backward, sizeof(int) * ONIG_CHAR_TABLE_SIZE)) - goto err_int_map_backward; - } if (IS_NOT_NULL(reg->p)) { if (COPY_FAILED(p, reg->alloc)) goto err_p; @@ -5744,10 +5647,6 @@ onig_reg_copy(regex_t** nreg, regex_t* oreg) err_repeat_range: xfree(reg->p); err_p: - xfree(reg->int_map_backward); - err_int_map_backward: - xfree(reg->int_map); - err_int_map: xfree(reg->exact); err: xfree(reg); @@ -5764,8 +5663,6 @@ onig_memsize(const regex_t *reg) if (IS_NULL(reg)) return 0; if (IS_NOT_NULL(reg->p)) size += reg->alloc; if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact; - if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE; - if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE; if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange); if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain); @@ -6030,6 +5927,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option, if (IS_NULL(reg)) return ONIGERR_INVALID_ARGUMENT; + (reg)->exact = (UChar* )NULL; + (reg)->chain = (regex_t* )NULL; + (reg)->p = (UChar* )NULL; + (reg)->name_table = (void* )NULL; + (reg)->repeat_range = (OnigRepeatRange* )NULL; + if (ONIGENC_IS_UNDEF(enc)) return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET; @@ -6049,15 +5952,9 @@ onig_reg_init(regex_t* reg, OnigOptionType option, (reg)->options = option; (reg)->syntax = syntax; (reg)->optimize = 0; - (reg)->exact = (UChar* )NULL; - (reg)->int_map = (int* )NULL; - (reg)->int_map_backward = (int* )NULL; - (reg)->chain = (regex_t* )NULL; - (reg)->p = (UChar* )NULL; (reg)->alloc = 0; (reg)->used = 0; - (reg)->name_table = (void* )NULL; (reg)->case_fold_flag = case_fold_flag; |
