diff options
| author | K.Takata <kentkt@csc.jp> | 2019-01-25 18:54:41 +0900 |
|---|---|---|
| committer | nagachika <nagachika@ruby-lang.org> | 2025-11-02 14:11:36 +0900 |
| commit | f0feca1a8495eba2706a7914f0c4f8128c281366 (patch) | |
| tree | fe15428f54645b2f0338509668b2efad9827442e | |
| parent | cbc1460efbc003e256cc239a4bb228e790308ecb (diff) | |
[Bug #13671] Fix syntax error caused by "ss" in look-behind
Fixes k-takata/Onigmo#92.
This fix was ported from oniguruma:
https://github.com/kkos/oniguruma/commit/257082dac8c6019198b56324012f0bd1830ff4ba
https://github.com/k-takata/Onigmo/commit/b1a5445fbeba97b3e94a733c2ce11c033453af73
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | regcomp.c | 37 |
| -rw-r--r-- | test/ruby/test_regexp.rb | 23 |

2 files changed, 45 insertions, 15 deletions
@@ -3301,6 +3301,14 @@ setup_subexp_call(Node* node, ScanEnv* env) } #endif +#define IN_ALT (1<<0) +#define IN_NOT (1<<1) +#define IN_REPEAT (1<<2) +#define IN_VAR_REPEAT (1<<3) +#define IN_CALL (1<<4) +#define IN_RECCALL (1<<5) +#define IN_LOOK_BEHIND (1<<6) + /* divide different length alternatives in look-behind. (?<=A|B) ==> (?<=A)|(?<=B) (?<!A|B) ==> (?<!A)(?<!B) @@ -3597,24 +3605,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[], return ONIGERR_MEMORY; } -static int -expand_case_fold_string(Node* node, regex_t* reg) -{ #define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8 +static int +expand_case_fold_string(Node* node, regex_t* reg, int state) +{ int r, n, len, alt_num; int varlen = 0; + int is_in_look_behind; UChar *start, *end, *p; Node *top_root, *root, *snode, *prev_node; OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM]; - StrNode* sn = NSTR(node); + StrNode* sn; if (NSTRING_IS_AMBIG(node)) return 0; + sn = NSTR(node); + start = sn->s; end = sn->end; if (start >= end) return 0; + is_in_look_behind = (state & IN_LOOK_BEHIND) != 0; + r = 0; top_root = root = prev_node = snode = NULL_NODE; alt_num = 1; @@ -3630,7 +3643,7 @@ expand_case_fold_string(Node* node, regex_t* reg) len = enclen(reg->enc, p, end); varlen = is_case_fold_variable_len(n, items, len); - if (n == 0 || varlen == 0) { + if (n == 0 || varlen == 0 || is_in_look_behind) { if (IS_NULL(snode)) { if (IS_NULL(root) && IS_NOT_NULL(prev_node)) { onig_node_free(top_root); @@ -3889,13 +3902,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env) } #endif -#define IN_ALT (1<<0) -#define IN_NOT (1<<1) -#define IN_REPEAT (1<<2) -#define IN_VAR_REPEAT (1<<3) -#define IN_CALL (1<<4) -#define IN_RECCALL (1<<5) - /* setup_tree does the following work. 1. check empty loop. (set qn->target_empty_info) 2. expand ignore-case in char class. 
@@ -3937,7 +3943,7 @@ restart: case NT_STR: if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) { - r = expand_case_fold_string(node, reg); + r = expand_case_fold_string(node, reg, state); } break; @@ -4180,7 +4186,7 @@ restart: if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, state, env); + r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } @@ -4193,7 +4199,8 @@ restart: if (r < 0) return r; if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN; if (NTYPE(node) != NT_ANCHOR) goto restart; - r = setup_tree(an->target, reg, (state | IN_NOT), env); + r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND), + env); if (r != 0) return r; r = setup_look_behind(node, reg, env); } diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index 010be01960..0c9dc78fd6 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -1602,6 +1602,29 @@ class TestRegexp < Test::Unit::TestCase assert_raise(RegexpError, bug12418){ Regexp.new('(0?0|(?(5)||)|(?(5)||))?') } end + def test_ss_in_look_behind + assert_match_at("(?i:ss)", "ss", [[0, 2]]) + assert_match_at("(?i:ss)", "Ss", [[0, 2]]) + assert_match_at("(?i:ss)", "SS", [[0, 2]]) + assert_match_at("(?i:ss)", "\u017fS", [[0, 2]]) # LATIN SMALL LETTER LONG S + assert_match_at("(?i:ss)", "s\u017f", [[0, 2]]) + assert_match_at("(?i:ss)", "\u00df", [[0, 1]]) # LATIN SMALL LETTER SHARP S + assert_match_at("(?i:ss)", "\u1e9e", [[0, 1]]) # LATIN CAPITAL LETTER SHARP S + assert_match_at("(?i:xssy)", "xssy", [[0, 4]]) + assert_match_at("(?i:xssy)", "xSsy", [[0, 4]]) + assert_match_at("(?i:xssy)", "xSSy", [[0, 4]]) + assert_match_at("(?i:xssy)", "x\u017fSy", [[0, 4]]) + assert_match_at("(?i:xssy)", "xs\u017fy", [[0, 4]]) + assert_match_at("(?i:xssy)", "x\u00dfy", [[0, 3]]) + assert_match_at("(?i:xssy)", "x\u1e9ey", [[0, 3]]) 
+ assert_match_at("(?i:\u00df)", "ss", [[0, 2]]) + assert_match_at("(?i:\u00df)", "SS", [[0, 2]]) + assert_match_at("(?i:[\u00df])", "ss", [[0, 2]]) + assert_match_at("(?i:[\u00df])", "SS", [[0, 2]]) + assert_match_at("(?i)(?<!ss)\u2728", "qq\u2728", [[2, 3]]) # Issue #92 + assert_match_at("(?i)(?<!xss)\u2728", "qq\u2728", [[2, 3]]) + end + def test_options_in_look_behind assert_nothing_raised { assert_match_at("(?<=(?i)ab)cd", "ABcd", [[2,4]]) |
