summaryrefslogtreecommitdiff
path: root/regcomp.c
diff options
context:
space:
mode:
Diffstat (limited to 'regcomp.c')
-rw-r--r--regcomp.c285
1 files changed, 91 insertions, 194 deletions
diff --git a/regcomp.c b/regcomp.c
index e389e6f120..1895ac24ed 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -2,8 +2,8 @@
regcomp.c - Onigmo (Oniguruma-mod) (regular expression library)
**********************************************************************/
/*-
- * Copyright (c) 2002-2013 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
- * Copyright (c) 2011-2016 K.Takata <kentkt AT csc DOT jp>
+ * Copyright (c) 2002-2018 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2011-2019 K.Takata <kentkt AT csc DOT jp>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -394,6 +394,7 @@ compile_tree_empty_check(Node* node, regex_t* reg, int empty_info)
r = add_mem_num(reg, reg->num_null_check); /* NULL CHECK ID */
if (r) return r;
reg->num_null_check++;
+ if ((MemNumType)reg->num_null_check <= 0) return ONIGERR_TOO_MANY_NULL_CHECK;
}
r = compile_tree(node, reg);
@@ -703,6 +704,7 @@ compile_range_repeat_node(QtfrNode* qn, int target_len, int empty_info,
if (r) return r;
r = add_mem_num(reg, num_repeat); /* OP_REPEAT ID */
reg->num_repeat++;
+ if ((MemNumType)reg->num_repeat <= 0) return ONIGERR_TOO_MANY_RANGE_REPEAT;
if (r) return r;
r = add_rel_addr(reg, target_len + SIZE_OP_REPEAT_INC);
if (r) return r;
@@ -2803,14 +2805,11 @@ get_head_value_node(Node* node, int exact, regex_t* reg)
case NT_STR:
{
StrNode* sn = NSTR(node);
-
if (sn->end <= sn->s)
break;
- if (exact != 0 &&
- !NSTRING_IS_RAW(node) && IS_IGNORECASE(reg->options)) {
- }
- else {
+ if (exact == 0 ||
+ NSTRING_IS_RAW(node) || !IS_IGNORECASE(reg->options)) {
n = node;
}
}
@@ -3301,6 +3300,14 @@ setup_subexp_call(Node* node, ScanEnv* env)
}
#endif
+#define IN_ALT (1<<0)
+#define IN_NOT (1<<1)
+#define IN_REPEAT (1<<2)
+#define IN_VAR_REPEAT (1<<3)
+#define IN_CALL (1<<4)
+#define IN_RECCALL (1<<5)
+#define IN_LOOK_BEHIND (1<<6)
+
/* divide different length alternatives in look-behind.
(?<=A|B) ==> (?<=A)|(?<=B)
(?<!A|B) ==> (?<!A)(?<!B)
@@ -3597,24 +3604,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
return ONIGERR_MEMORY;
}
-static int
-expand_case_fold_string(Node* node, regex_t* reg)
-{
#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
+static int
+expand_case_fold_string(Node* node, regex_t* reg, int state)
+{
int r, n, len, alt_num;
int varlen = 0;
+ int is_in_look_behind;
UChar *start, *end, *p;
Node *top_root, *root, *snode, *prev_node;
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
- StrNode* sn = NSTR(node);
+ StrNode* sn;
if (NSTRING_IS_AMBIG(node)) return 0;
+ sn = NSTR(node);
+
start = sn->s;
end = sn->end;
if (start >= end) return 0;
+ is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
+
r = 0;
top_root = root = prev_node = snode = NULL_NODE;
alt_num = 1;
@@ -3630,7 +3642,7 @@ expand_case_fold_string(Node* node, regex_t* reg)
len = enclen(reg->enc, p, end);
varlen = is_case_fold_variable_len(n, items, len);
- if (n == 0 || varlen == 0) {
+ if (n == 0 || varlen == 0 || is_in_look_behind) {
if (IS_NULL(snode)) {
if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
onig_node_free(top_root);
@@ -3889,13 +3901,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
}
#endif
-#define IN_ALT (1<<0)
-#define IN_NOT (1<<1)
-#define IN_REPEAT (1<<2)
-#define IN_VAR_REPEAT (1<<3)
-#define IN_CALL (1<<4)
-#define IN_RECCALL (1<<5)
-
/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
2. expand ignore-case in char class.
@@ -3937,7 +3942,7 @@ restart:
case NT_STR:
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
- r = expand_case_fold_string(node, reg);
+ r = expand_case_fold_string(node, reg, state);
}
break;
@@ -4180,7 +4185,7 @@ restart:
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
- r = setup_tree(an->target, reg, state, env);
+ r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
@@ -4193,7 +4198,8 @@ restart:
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
- r = setup_tree(an->target, reg, (state | IN_NOT), env);
+ r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
+ env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
@@ -4209,169 +4215,73 @@ restart:
return r;
}
-#ifndef USE_SUNDAY_QUICK_SEARCH
-/* set skip map for Boyer-Moore search */
+/* set skip map for Sunday's quick search */
static int
set_bm_skip(UChar* s, UChar* end, regex_t* reg,
- UChar skip[], int** int_skip, int ignore_case)
+ UChar skip[], int ignore_case)
{
OnigDistance i, len;
int clen, flen, n, j, k;
- UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
+ UChar *p, buf[ONIGENC_MBC_CASE_FOLD_MAXLEN];
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
OnigEncoding enc = reg->enc;
len = end - s;
- if (len < ONIG_CHAR_TABLE_SIZE) {
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )len;
-
- n = 0;
- for (i = 0; i < len - 1; i += clen) {
- p = s + i;
- if (ignore_case)
- n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
- p, end, items);
- clen = enclen(enc, p, end);
- if (p + clen > end)
- clen = (int )(end - p);
-
- for (j = 0; j < n; j++) {
- if ((items[j].code_len != 1) || (items[j].byte_len != clen))
- return 1; /* different length isn't supported. */
- flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
- if (flen != clen)
- return 1; /* different length isn't supported. */
- }
- for (j = 0; j < clen; j++) {
- skip[s[i + j]] = (UChar )(len - 1 - i - j);
- for (k = 0; k < n; k++) {
- skip[buf[k][j]] = (UChar )(len - 1 - i - j);
- }
- }
- }
- }
- else {
-# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
+ if (len >= ONIG_CHAR_TABLE_SIZE) {
/* This should not happen. */
return ONIGERR_TYPE_BUG;
-# else
- if (IS_NULL(*int_skip)) {
- *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
- if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
- }
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )len;
-
- n = 0;
- for (i = 0; i < len - 1; i += clen) {
- p = s + i;
- if (ignore_case)
- n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
- p, end, items);
- clen = enclen(enc, p, end);
- if (p + clen > end)
- clen = (int )(end - p);
-
- for (j = 0; j < n; j++) {
- if ((items[j].code_len != 1) || (items[j].byte_len != clen))
- return 1; /* different length isn't supported. */
- flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
- if (flen != clen)
- return 1; /* different length isn't supported. */
- }
- for (j = 0; j < clen; j++) {
- (*int_skip)[s[i + j]] = (int )(len - 1 - i - j);
- for (k = 0; k < n; k++) {
- (*int_skip)[buf[k][j]] = (int )(len - 1 - i - j);
- }
- }
- }
-# endif
}
- return 0;
-}
-
-#else /* USE_SUNDAY_QUICK_SEARCH */
-
-/* set skip map for Sunday's quick search */
-static int
-set_bm_skip(UChar* s, UChar* end, regex_t* reg,
- UChar skip[], int** int_skip, int ignore_case)
-{
- OnigDistance i, len;
- int clen, flen, n, j, k;
- UChar *p, buf[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM][ONIGENC_MBC_CASE_FOLD_MAXLEN];
- OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
- OnigEncoding enc = reg->enc;
- len = end - s;
- if (len < ONIG_CHAR_TABLE_SIZE) {
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) skip[i] = (UChar )(len + 1);
-
- n = 0;
+ if (ignore_case) {
for (i = 0; i < len; i += clen) {
p = s + i;
- if (ignore_case)
- n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
- p, end, items);
+ n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
+ p, end, items);
clen = enclen(enc, p, end);
if (p + clen > end)
clen = (int )(end - p);
for (j = 0; j < n; j++) {
- if ((items[j].code_len != 1) || (items[j].byte_len != clen))
- return 1; /* different length isn't supported. */
- flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
- if (flen != clen)
- return 1; /* different length isn't supported. */
- }
- for (j = 0; j < clen; j++) {
- skip[s[i + j]] = (UChar )(len - i - j);
- for (k = 0; k < n; k++) {
- skip[buf[k][j]] = (UChar )(len - i - j);
- }
+ if ((items[j].code_len != 1) || (items[j].byte_len != clen)) {
+ /* Different length isn't supported. Stop optimization at here. */
+ end = p;
+ goto endcheck;
+ }
+ flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf);
+ if (flen != clen) {
+ /* Different length isn't supported. Stop optimization at here. */
+ end = p;
+ goto endcheck;
+ }
}
}
+endcheck:
+ len = end - s;
}
- else {
-# if OPT_EXACT_MAXLEN < ONIG_CHAR_TABLE_SIZE
- /* This should not happen. */
- return ONIGERR_TYPE_BUG;
-# else
- if (IS_NULL(*int_skip)) {
- *int_skip = (int* )xmalloc(sizeof(int) * ONIG_CHAR_TABLE_SIZE);
- if (IS_NULL(*int_skip)) return ONIGERR_MEMORY;
- }
- for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++) (*int_skip)[i] = (int )(len + 1);
-
- n = 0;
- for (i = 0; i < len; i += clen) {
- p = s + i;
- if (ignore_case)
- n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
- p, end, items);
- clen = enclen(enc, p, end);
- if (p + clen > end)
- clen = (int )(end - p);
- for (j = 0; j < n; j++) {
- if ((items[j].code_len != 1) || (items[j].byte_len != clen))
- return 1; /* different length isn't supported. */
- flen = ONIGENC_CODE_TO_MBC(enc, items[j].code[0], buf[j]);
- if (flen != clen)
- return 1; /* different length isn't supported. */
- }
- for (j = 0; j < clen; j++) {
- (*int_skip)[s[i + j]] = (int )(len - i - j);
- for (k = 0; k < n; k++) {
- (*int_skip)[buf[k][j]] = (int )(len - i - j);
- }
+ for (i = 0; i < ONIG_CHAR_TABLE_SIZE; i++)
+ skip[i] = (UChar )(len + 1);
+ n = 0;
+ for (i = 0; i < len; i += clen) {
+ p = s + i;
+ if (ignore_case)
+ n = ONIGENC_GET_CASE_FOLD_CODES_BY_STR(enc, reg->case_fold_flag,
+ p, end, items);
+ clen = enclen(enc, p, end);
+ if (p + clen > end)
+ clen = (int )(end - p);
+
+ for (j = 0; j < clen; j++) {
+ skip[s[i + j]] = (UChar )(len - i - j);
+ for (k = 0; k < n; k++) {
+ ONIGENC_CODE_TO_MBC(enc, items[k].code[0], buf);
+ skip[buf[j]] = (UChar )(len - i - j);
}
}
-# endif
}
- return 0;
+
+ return (int )len;
}
-#endif /* USE_SUNDAY_QUICK_SEARCH */
typedef struct {
OnigDistance min; /* min byte length */
@@ -5049,7 +4959,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
if (NSTRING_IS_DONT_GET_OPT_INFO(node)) {
int n = onigenc_strlen(env->enc, sn->s, sn->end);
- max = ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
+ max = (OnigDistance )ONIGENC_MBC_MAXLEN_DIST(env->enc) * (OnigDistance)n;
}
else {
concat_opt_exact_info_str(&opt->exb, sn->s, sn->end,
@@ -5232,7 +5142,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
r = optimize_node_left(qn->target, &nopt, env);
if (r) break;
- if (/*qn->lower == 0 &&*/ IS_REPEAT_INFINITE(qn->upper)) {
+ if (qn->lower == 0 && IS_REPEAT_INFINITE(qn->upper)) {
if (env->mmd.max == 0 &&
NTYPE(qn->target) == NT_CANY && qn->greedy) {
if (IS_MULTILINE(env->options))
@@ -5342,7 +5252,6 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env)
static int
set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
{
- int r;
int allow_reverse;
if (e->len == 0) return 0;
@@ -5357,13 +5266,22 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
if (e->ignore_case > 0) {
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
- r = set_bm_skip(reg->exact, reg->exact_end, reg,
- reg->map, &(reg->int_map), 1);
- if (r == 0) {
+ int orig_len = e->len;
+ e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
+ reg->map, 1);
+ if (e->len >= 3) {
+ reg->exact_end = reg->exact + e->len;
reg->optimize = (allow_reverse != 0
? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
}
else {
+ /* Even if BM skip table can't be built (e.g., pattern starts with
+ 's' or 'k' which have multi-byte case fold variants), we should
+ still use EXACT_IC optimization with the original pattern.
+ Without this fallback, patterns like /slackware/i have no
+ optimization at all, causing severe performance regression
+ especially with non-ASCII strings. See [Bug #21824] */
+ e->len = orig_len; /* Restore original length for EXACT_IC */
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
}
}
@@ -5373,15 +5291,10 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
}
else {
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
- r = set_bm_skip(reg->exact, reg->exact_end, reg,
- reg->map, &(reg->int_map), 0);
- if (r == 0) {
- reg->optimize = (allow_reverse != 0
- ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
- }
- else {
- reg->optimize = ONIG_OPTIMIZE_EXACT;
- }
+ set_bm_skip(reg->exact, reg->exact_end, reg,
+ reg->map, 0);
+ reg->optimize = (allow_reverse != 0
+ ? ONIG_OPTIMIZE_EXACT_BM : ONIG_OPTIMIZE_EXACT_BM_NOT_REV);
}
else {
reg->optimize = ONIG_OPTIMIZE_EXACT;
@@ -5662,8 +5575,6 @@ onig_free_body(regex_t* reg)
if (IS_NOT_NULL(reg)) {
xfree(reg->p);
xfree(reg->exact);
- xfree(reg->int_map);
- xfree(reg->int_map_backward);
xfree(reg->repeat_range);
onig_free(reg->chain);
@@ -5710,14 +5621,6 @@ onig_reg_copy(regex_t** nreg, regex_t* oreg)
(reg)->exact_end = (reg)->exact + exact_size;
}
- if (IS_NOT_NULL(reg->int_map)) {
- if (COPY_FAILED(int_map, sizeof(int) * ONIG_CHAR_TABLE_SIZE))
- goto err_int_map;
- }
- if (IS_NOT_NULL(reg->int_map_backward)) {
- if (COPY_FAILED(int_map_backward, sizeof(int) * ONIG_CHAR_TABLE_SIZE))
- goto err_int_map_backward;
- }
if (IS_NOT_NULL(reg->p)) {
if (COPY_FAILED(p, reg->alloc))
goto err_p;
@@ -5744,10 +5647,6 @@ onig_reg_copy(regex_t** nreg, regex_t* oreg)
err_repeat_range:
xfree(reg->p);
err_p:
- xfree(reg->int_map_backward);
- err_int_map_backward:
- xfree(reg->int_map);
- err_int_map:
xfree(reg->exact);
err:
xfree(reg);
@@ -5764,8 +5663,6 @@ onig_memsize(const regex_t *reg)
if (IS_NULL(reg)) return 0;
if (IS_NOT_NULL(reg->p)) size += reg->alloc;
if (IS_NOT_NULL(reg->exact)) size += reg->exact_end - reg->exact;
- if (IS_NOT_NULL(reg->int_map)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
- if (IS_NOT_NULL(reg->int_map_backward)) size += sizeof(int) * ONIG_CHAR_TABLE_SIZE;
if (IS_NOT_NULL(reg->repeat_range)) size += reg->repeat_range_alloc * sizeof(OnigRepeatRange);
if (IS_NOT_NULL(reg->chain)) size += onig_memsize(reg->chain);
@@ -6030,6 +5927,12 @@ onig_reg_init(regex_t* reg, OnigOptionType option,
if (IS_NULL(reg))
return ONIGERR_INVALID_ARGUMENT;
+ (reg)->exact = (UChar* )NULL;
+ (reg)->chain = (regex_t* )NULL;
+ (reg)->p = (UChar* )NULL;
+ (reg)->name_table = (void* )NULL;
+ (reg)->repeat_range = (OnigRepeatRange* )NULL;
+
if (ONIGENC_IS_UNDEF(enc))
return ONIGERR_DEFAULT_ENCODING_IS_NOT_SET;
@@ -6049,15 +5952,9 @@ onig_reg_init(regex_t* reg, OnigOptionType option,
(reg)->options = option;
(reg)->syntax = syntax;
(reg)->optimize = 0;
- (reg)->exact = (UChar* )NULL;
- (reg)->int_map = (int* )NULL;
- (reg)->int_map_backward = (int* )NULL;
- (reg)->chain = (regex_t* )NULL;
- (reg)->p = (UChar* )NULL;
(reg)->alloc = 0;
(reg)->used = 0;
- (reg)->name_table = (void* )NULL;
(reg)->case_fold_flag = case_fold_flag;