summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: K.Takata <kentkt@csc.jp> 2019-01-25 18:54:41 +0900
committer: nagachika <nagachika@ruby-lang.org> 2025-11-02 14:11:36 +0900
commit: f0feca1a8495eba2706a7914f0c4f8128c281366 (patch)
tree: fe15428f54645b2f0338509668b2efad9827442e
parent: cbc1460efbc003e256cc239a4bb228e790308ecb (diff)
[Bug #13671] Fix that "ss" in look-behind causes syntax error
Fixes k-takata/Onigmo#92.
This fix was ported from oniguruma:
https://github.com/kkos/oniguruma/commit/257082dac8c6019198b56324012f0bd1830ff4ba
https://github.com/k-takata/Onigmo/commit/b1a5445fbeba97b3e94a733c2ce11c033453af73
-rw-r--r-- regcomp.c 37
-rw-r--r-- test/ruby/test_regexp.rb 23
2 files changed, 45 insertions, 15 deletions
diff --git a/regcomp.c b/regcomp.c
index d221ff34dc..ed81000310 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -3301,6 +3301,14 @@ setup_subexp_call(Node* node, ScanEnv* env)
}
#endif
+#define IN_ALT (1<<0)
+#define IN_NOT (1<<1)
+#define IN_REPEAT (1<<2)
+#define IN_VAR_REPEAT (1<<3)
+#define IN_CALL (1<<4)
+#define IN_RECCALL (1<<5)
+#define IN_LOOK_BEHIND (1<<6)
+
/* divide different length alternatives in look-behind.
(?<=A|B) ==> (?<=A)|(?<=B)
(?<!A|B) ==> (?<!A)(?<!B)
@@ -3597,24 +3605,29 @@ expand_case_fold_string_alt(int item_num, OnigCaseFoldCodeItem items[],
return ONIGERR_MEMORY;
}
-static int
-expand_case_fold_string(Node* node, regex_t* reg)
-{
#define THRESHOLD_CASE_FOLD_ALT_FOR_EXPANSION 8
+static int
+expand_case_fold_string(Node* node, regex_t* reg, int state)
+{
int r, n, len, alt_num;
int varlen = 0;
+ int is_in_look_behind;
UChar *start, *end, *p;
Node *top_root, *root, *snode, *prev_node;
OnigCaseFoldCodeItem items[ONIGENC_GET_CASE_FOLD_CODES_MAX_NUM];
- StrNode* sn = NSTR(node);
+ StrNode* sn;
if (NSTRING_IS_AMBIG(node)) return 0;
+ sn = NSTR(node);
+
start = sn->s;
end = sn->end;
if (start >= end) return 0;
+ is_in_look_behind = (state & IN_LOOK_BEHIND) != 0;
+
r = 0;
top_root = root = prev_node = snode = NULL_NODE;
alt_num = 1;
@@ -3630,7 +3643,7 @@ expand_case_fold_string(Node* node, regex_t* reg)
len = enclen(reg->enc, p, end);
varlen = is_case_fold_variable_len(n, items, len);
- if (n == 0 || varlen == 0) {
+ if (n == 0 || varlen == 0 || is_in_look_behind) {
if (IS_NULL(snode)) {
if (IS_NULL(root) && IS_NOT_NULL(prev_node)) {
onig_node_free(top_root);
@@ -3889,13 +3902,6 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
}
#endif
-#define IN_ALT (1<<0)
-#define IN_NOT (1<<1)
-#define IN_REPEAT (1<<2)
-#define IN_VAR_REPEAT (1<<3)
-#define IN_CALL (1<<4)
-#define IN_RECCALL (1<<5)
-
/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
2. expand ignore-case in char class.
@@ -3937,7 +3943,7 @@ restart:
case NT_STR:
if (IS_IGNORECASE(reg->options) && !NSTRING_IS_RAW(node)) {
- r = expand_case_fold_string(node, reg);
+ r = expand_case_fold_string(node, reg, state);
}
break;
@@ -4180,7 +4186,7 @@ restart:
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
- r = setup_tree(an->target, reg, state, env);
+ r = setup_tree(an->target, reg, (state | IN_LOOK_BEHIND), env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
@@ -4193,7 +4199,8 @@ restart:
if (r < 0) return r;
if (r > 0) return ONIGERR_INVALID_LOOK_BEHIND_PATTERN;
if (NTYPE(node) != NT_ANCHOR) goto restart;
- r = setup_tree(an->target, reg, (state | IN_NOT), env);
+ r = setup_tree(an->target, reg, (state | IN_NOT | IN_LOOK_BEHIND),
+ env);
if (r != 0) return r;
r = setup_look_behind(node, reg, env);
}
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 010be01960..0c9dc78fd6 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -1602,6 +1602,29 @@ class TestRegexp < Test::Unit::TestCase
assert_raise(RegexpError, bug12418){ Regexp.new('(0?0|(?(5)||)|(?(5)||))?') }
end
+ def test_ss_in_look_behind
+ assert_match_at("(?i:ss)", "ss", [[0, 2]])
+ assert_match_at("(?i:ss)", "Ss", [[0, 2]])
+ assert_match_at("(?i:ss)", "SS", [[0, 2]])
+ assert_match_at("(?i:ss)", "\u017fS", [[0, 2]]) # LATIN SMALL LETTER LONG S
+ assert_match_at("(?i:ss)", "s\u017f", [[0, 2]])
+ assert_match_at("(?i:ss)", "\u00df", [[0, 1]]) # LATIN SMALL LETTER SHARP S
+ assert_match_at("(?i:ss)", "\u1e9e", [[0, 1]]) # LATIN CAPITAL LETTER SHARP S
+ assert_match_at("(?i:xssy)", "xssy", [[0, 4]])
+ assert_match_at("(?i:xssy)", "xSsy", [[0, 4]])
+ assert_match_at("(?i:xssy)", "xSSy", [[0, 4]])
+ assert_match_at("(?i:xssy)", "x\u017fSy", [[0, 4]])
+ assert_match_at("(?i:xssy)", "xs\u017fy", [[0, 4]])
+ assert_match_at("(?i:xssy)", "x\u00dfy", [[0, 3]])
+ assert_match_at("(?i:xssy)", "x\u1e9ey", [[0, 3]])
+ assert_match_at("(?i:\u00df)", "ss", [[0, 2]])
+ assert_match_at("(?i:\u00df)", "SS", [[0, 2]])
+ assert_match_at("(?i:[\u00df])", "ss", [[0, 2]])
+ assert_match_at("(?i:[\u00df])", "SS", [[0, 2]])
+ assert_match_at("(?i)(?<!ss)\u2728", "qq\u2728", [[2, 3]]) # Issue #92
+ assert_match_at("(?i)(?<!xss)\u2728", "qq\u2728", [[2, 3]])
+ end
+
def test_options_in_look_behind
assert_nothing_raised {
assert_match_at("(?<=(?i)ab)cd", "ABcd", [[2,4]])