summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNARUSE, Yui <nurse@users.noreply.github.com>2024-03-21 02:13:59 +0900
committerGitHub <noreply@github.com>2024-03-20 17:13:59 +0000
commit00cb72157a60c20a9b9d9fe81fc974ea83d672b4 (patch)
tree3f5d238c1e7aabcd6222beefa5c95b5679510bd3
parentc626c201e4129bbea17583ecef73472c6f668c81 (diff)
merge revision(s) 3e6e3ca2627b1aa71b17de902cc1b8188246a828: [Backport #20207] (#10299)
Correctly handle consecutive lookarounds (#9738) Fix [Bug #20207] Fix [Bug #20212] Handling consecutive lookarounds in init_cache_opcodes is buggy, so it causes invalid memory access reported in [Bug #20207] and [Bug #20212]. This fixes it by using recursive functions to detected lookarounds nesting correctly.
-rw-r--r--regexec.c223
-rw-r--r--test/ruby/test_regexp.rb12
-rw-r--r--version.h2
3 files changed, 151 insertions, 86 deletions
diff --git a/regexec.c b/regexec.c
index fb82d32531..ed1598c497 100644
--- a/regexec.c
+++ b/regexec.c
@@ -258,18 +258,19 @@ We encode a match cache point to an integer value by the following equation:
A bit-array for memoizing (recording) match cache points once backtracked.
*/
-/* count the total number of cache opcodes for allocating a match cache buffer. */
-static OnigPosition
-count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
+static OnigPosition count_num_cache_opcodes_inner(
+ const regex_t* reg,
+ MemNumType current_repeat_mem, int lookaround_nesting,
+ UChar** pp, long* num_cache_opcodes_ptr
+)
{
- UChar* p = reg->p;
- UChar* pend = p + reg->used;
+ UChar* p = *pp;
+ UChar* pend = reg->p + reg->used;
LengthType len;
MemNumType repeat_mem;
OnigEncoding enc = reg->enc;
- MemNumType current_repeat_mem = -1;
- long num_cache_opcodes = 0;
- int lookaround_nesting = 0;
+ long num_cache_opcodes = *num_cache_opcodes_ptr;
+ OnigPosition result;
while (p < pend) {
switch (*p++) {
@@ -402,7 +403,16 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
if (reg->repeat_range[repeat_mem].lower == 0) {
num_cache_opcodes++;
}
- current_repeat_mem = repeat_mem;
+ result = count_num_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &p, &num_cache_opcodes);
+ if (result < 0 || num_cache_opcodes < 0) {
+ goto fail;
+ }
+ {
+ OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
+ if (repeat_range->lower < repeat_range->upper) {
+ num_cache_opcodes++;
+ }
+ }
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
@@ -411,14 +421,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A lone or invalid OP_REPEAT_INC is found.
goto impossible;
}
- {
- OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
- if (repeat_range->lower < repeat_range->upper) {
- num_cache_opcodes++;
- }
- current_repeat_mem = -1;
- }
- break;
+ goto exit;
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
goto impossible;
@@ -438,7 +441,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A look-around nested in a atomic grouping is found.
goto impossible;
}
- lookaround_nesting++;
+ result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
+ if (result < 0 || num_cache_opcodes < 0) {
+ goto fail;
+ }
break;
case OP_PUSH_POS_NOT:
if (lookaround_nesting < 0) {
@@ -446,7 +452,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
goto impossible;
}
p += SIZE_RELADDR;
- lookaround_nesting++;
+ result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
+ if (result < 0 || num_cache_opcodes < 0) {
+ goto fail;
+ }
break;
case OP_PUSH_LOOK_BEHIND_NOT:
if (lookaround_nesting < 0) {
@@ -455,25 +464,26 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
}
p += SIZE_RELADDR;
p += SIZE_LENGTH;
- lookaround_nesting++;
- break;
- case OP_POP_POS:
- case OP_FAIL_POS:
- case OP_FAIL_LOOK_BEHIND_NOT:
- lookaround_nesting--;
+ result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
+ if (result < 0 || num_cache_opcodes < 0) {
+ goto fail;
+ }
break;
case OP_PUSH_STOP_BT:
if (lookaround_nesting != 0) {
// A nested atomic grouping is found.
goto impossible;
}
- lookaround_nesting++;
- lookaround_nesting *= -1;
+ result = count_num_cache_opcodes_inner(reg, current_repeat_mem, -1, &p, &num_cache_opcodes);
+ if (result < 0 || num_cache_opcodes < 0) {
+ goto fail;
+ }
break;
+ case OP_POP_POS:
+ case OP_FAIL_POS:
+ case OP_FAIL_LOOK_BEHIND_NOT:
case OP_POP_STOP_BT:
- lookaround_nesting *= -1;
- lookaround_nesting--;
- break;
+ goto exit;
case OP_LOOK_BEHIND:
p += SIZE_LENGTH;
break;
@@ -507,9 +517,15 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
}
}
+exit:
+ *pp = p;
*num_cache_opcodes_ptr = num_cache_opcodes;
return 0;
+fail:
+ *num_cache_opcodes_ptr = num_cache_opcodes;
+ return result;
+
impossible:
*num_cache_opcodes_ptr = NUM_CACHE_OPCODES_IMPOSSIBLE;
return 0;
@@ -518,48 +534,49 @@ bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE;
}
-/* collect cache opcodes from the given regex program, and compute the total number of cache points. */
+/* count the total number of cache opcodes for allocating a match cache buffer. */
static OnigPosition
-init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num_cache_points_ptr)
+count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
{
- UChar* pbegin;
UChar* p = reg->p;
- UChar* pend = p + reg->used;
+ *num_cache_opcodes_ptr = 0;
+ OnigPosition result = count_num_cache_opcodes_inner(reg, -1, 0, &p, num_cache_opcodes_ptr);
+ if (result == 0 && *num_cache_opcodes_ptr >= 0 && p != reg->p + reg->used) {
+ return ONIGERR_UNDEFINED_BYTECODE;
+ }
+
+ return result;
+}
+
+static OnigPosition
+init_cache_opcodes_inner(
+ const regex_t* reg,
+ MemNumType current_repeat_mem, int lookaround_nesting,
+ OnigCacheOpcode** cache_opcodes_ptr, UChar** pp, long* num_cache_points_ptr
+)
+{
+ UChar* p = *pp;
+ UChar* pend = reg->p + reg->used;
+ UChar* pbegin;
LengthType len;
MemNumType repeat_mem;
OnigEncoding enc = reg->enc;
- MemNumType current_repeat_mem = -1;
- long cache_point = 0;
- long num_cache_points_at_repeat = 0;
- int lookaround_nesting = 0;
- const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
- OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
+ long cache_point = *num_cache_points_ptr;
+ OnigCacheOpcode *cache_opcodes = *cache_opcodes_ptr;
+ OnigPosition result;
# define INC_CACHE_OPCODES do {\
cache_opcodes->addr = pbegin;\
- cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\
+ cache_opcodes->cache_point = cache_point;\
cache_opcodes->outer_repeat_mem = current_repeat_mem;\
- cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\
+ cache_opcodes->num_cache_points_at_outer_repeat = 0;\
cache_opcodes->num_cache_points_in_outer_repeat = 0;\
cache_opcodes->lookaround_nesting = lookaround_nesting;\
+ cache_opcodes->match_addr = NULL;\
cache_point += lookaround_nesting != 0 ? 2 : 1;\
cache_opcodes++;\
} while (0)
-# define INC_LOOKAROUND_NESTING lookaround_nesting++
-# define DEC_LOOKAROUND_NESTING do {\
- UChar* match_addr = p - 1;\
- OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
- while (\
- cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
- cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
- ) {\
- cache_opcodes_in_lookaround->match_addr = match_addr;\
- cache_opcodes_in_lookaround--;\
- }\
- lookaround_nesting--;\
- } while (0)
-
while (p < pend) {
pbegin = p;
switch (*p++) {
@@ -692,33 +709,31 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
if (reg->repeat_range[repeat_mem].lower == 0) {
INC_CACHE_OPCODES;
}
- current_repeat_mem = repeat_mem;
- num_cache_points_at_repeat = cache_point;
- cache_opcodes_at_repeat = cache_opcodes;
- break;
- case OP_REPEAT_INC:
- case OP_REPEAT_INC_NG:
- GET_MEMNUM_INC(repeat_mem, p);
{
- long num_cache_points_in_repeat = cache_point - num_cache_points_at_repeat;
+ long num_cache_points_in_repeat = 0;
+ long num_cache_points_at_repeat = cache_point;
+ OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes;
+ result = init_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &cache_opcodes, &p, &num_cache_points_in_repeat);
+ if (result != 0) {
+ goto fail;
+ }
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
INC_CACHE_OPCODES;
cache_point -= lookaround_nesting != 0 ? 2 : 1;
}
- cache_point -= num_cache_points_in_repeat;
int repeat_bounds = repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower;
cache_point += num_cache_points_in_repeat * repeat_range->lower + (num_cache_points_in_repeat + (lookaround_nesting != 0 ? 2 : 1)) * repeat_bounds;
- OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes - 1;
- while (cache_opcodes_at_repeat <= cache_opcodes_in_repeat) {
+ for (; cache_opcodes_in_repeat < cache_opcodes; cache_opcodes_in_repeat++) {
+ cache_opcodes_in_repeat->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;
cache_opcodes_in_repeat->num_cache_points_in_outer_repeat = num_cache_points_in_repeat;
- cache_opcodes_in_repeat--;
}
- current_repeat_mem = -1;
- num_cache_points_at_repeat = 0;
- cache_opcodes_at_repeat = NULL;
}
break;
+ case OP_REPEAT_INC:
+ case OP_REPEAT_INC_NG:
+ p += SIZE_MEMNUM;
+ goto exit;
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
goto unexpected_bytecode_error;
@@ -734,33 +749,51 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
break;
case OP_PUSH_POS:
- INC_LOOKAROUND_NESTING;
+ lookaround:
+ {
+ OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes;
+ result = init_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &cache_opcodes, &p, &cache_point);
+ if (result != 0) {
+ goto fail;
+ }
+ UChar* match_addr = p - 1;
+ for (; cache_opcodes_in_lookaround < cache_opcodes; cache_opcodes_in_lookaround++) {
+ if (cache_opcodes_in_lookaround->match_addr == NULL) {
+ cache_opcodes_in_lookaround->match_addr = match_addr;
+ }
+ }
+ }
break;
case OP_PUSH_POS_NOT:
p += SIZE_RELADDR;
- INC_LOOKAROUND_NESTING;
- break;
+ goto lookaround;
case OP_PUSH_LOOK_BEHIND_NOT:
p += SIZE_RELADDR;
p += SIZE_LENGTH;
- INC_LOOKAROUND_NESTING;
+ goto lookaround;
+ case OP_PUSH_STOP_BT:
+ {
+ OnigCacheOpcode* cache_opcodes_in_atomic = cache_opcodes;
+ result = init_cache_opcodes_inner(reg, current_repeat_mem, -1, &cache_opcodes, &p, &cache_point);
+ if (result != 0) {
+ goto fail;
+ }
+ UChar* match_addr = p - 1;
+ for (; cache_opcodes_in_atomic < cache_opcodes; cache_opcodes_in_atomic++) {
+ if (cache_opcodes_in_atomic->match_addr == NULL) {
+ cache_opcodes_in_atomic->match_addr = match_addr;
+ }
+ }
+ }
break;
case OP_POP_POS:
case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT:
- DEC_LOOKAROUND_NESTING;
- break;
- case OP_PUSH_STOP_BT:
- INC_LOOKAROUND_NESTING;
- lookaround_nesting *= -1;
- break;
case OP_POP_STOP_BT:
- lookaround_nesting *= -1;
- DEC_LOOKAROUND_NESTING;
- break;
+ goto exit;
case OP_LOOK_BEHIND:
p += SIZE_LENGTH;
- break;
+ break;
case OP_ABSENT_END:
case OP_ABSENT:
@@ -790,15 +823,35 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
}
}
+exit:
+ *cache_opcodes_ptr = cache_opcodes;
+ *pp = p;
*num_cache_points_ptr = cache_point;
return 0;
+fail:
+ return result;
+
unexpected_bytecode_error:
return ONIGERR_UNEXPECTED_BYTECODE;
bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE;
}
+
+/* collect cache opcodes from the given regex program, and compute the total number of cache points. */
+static OnigPosition
+init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes_ptr, long* num_cache_points_ptr)
+{
+ UChar* p = reg->p;
+ *num_cache_points_ptr = 0;
+ OnigPosition result = init_cache_opcodes_inner(reg, -1, 0, &cache_opcodes_ptr, &p, num_cache_points_ptr);
+ if (result == 0 && p != reg->p + reg->used) {
+ return ONIGERR_UNDEFINED_BYTECODE;
+ }
+
+ return result;
+}
#else
static OnigPosition
count_num_cache_opcodes(regex_t* reg, long* num_cache_opcodes)
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index 4e04ccee69..84318ada31 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -2033,6 +2033,18 @@ class TestRegexp < Test::Unit::TestCase
assert /^(?:.+){2,4}?b|b/.match? "aaaabaa"
end
+ def test_bug_20207 # [Bug #20207]
+ assert(!'clan'.match?(/(?=.*a)(?!.*n)/))
+ end
+
+ def test_bug_20212 # [Bug #20212]
+ regex = Regexp.new(
+ /\A((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+)\Z/x
+ )
+ string = "www.google.com"
+ 100.times.each { assert(regex.match?(string)) }
+ end
+
def test_linear_time_p
assert_send [Regexp, :linear_time?, /a/]
assert_send [Regexp, :linear_time?, 'a']
diff --git a/version.h b/version.h
index 6e24079fb6..6096899632 100644
--- a/version.h
+++ b/version.h
@@ -11,7 +11,7 @@
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
#define RUBY_VERSION_TEENY 0
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
-#define RUBY_PATCHLEVEL 16
+#define RUBY_PATCHLEVEL 17
#include "ruby/version.h"
#include "ruby/internal/abi.h"