summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Hasiński <krzysztof.hasinski@gmail.com>2026-01-11 14:12:07 +0100
committerNobuyoshi Nakada <nobu.nakada@gmail.com>2026-01-13 10:19:44 +0900
commit5de4cc56086493689701e86aa0ccf6a4a4a87d75 (patch)
tree013d4d95e610f34a186a76cf20106136f96772e8
parent09cd13114a7c30a7fecae1bda726a419cd1b4bf2 (diff)
Fix regexp performance regression for patterns starting with s/k
Commit 981ee02c7c ("Fix performance problem with /k/i and /s/i") was merged for Ruby 4.0 to enable partial Boyer-Moore optimization for patterns containing 's' or 'k' by using the prefix before those characters. However, when 's' or 'k' appears at the start of a pattern (no usable prefix), set_bm_skip() returns 0 and the code returned early without setting any optimization mode, leaving reg->optimize at ONIG_OPTIMIZE_NONE. This caused up to 30x slowdown for patterns like /slackware/i when matched against strings with non-ASCII characters. This patch keeps the improvement from 981ee02c7c for patterns with a usable prefix of 3 or more characters, while fixing the regression by falling back to ONIG_OPTIMIZE_EXACT_IC with the full pattern when the usable prefix is shorter than 3 characters. Before: /\bslackware\b/i with non-ASCII string: 2.24 us/op After: /\bslackware\b/i with non-ASCII string: 0.70 us/op (3.2x faster) [Bug #21824]
-rw-r--r--regcomp.c14
1 files changed, 10 insertions, 4 deletions
diff --git a/regcomp.c b/regcomp.c
index 18b2c97eb6..320cf520e9 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -5264,18 +5264,24 @@ set_optimize_exact_info(regex_t* reg, OptExactInfo* e)
if (e->ignore_case > 0) {
if (e->len >= 3 || (e->len >= 2 && allow_reverse)) {
+ int orig_len = e->len;
e->len = set_bm_skip(reg->exact, reg->exact_end, reg,
reg->map, 1);
- reg->exact_end = reg->exact + e->len;
if (e->len >= 3) {
+ reg->exact_end = reg->exact + e->len;
reg->optimize = (allow_reverse != 0
? ONIG_OPTIMIZE_EXACT_BM_IC : ONIG_OPTIMIZE_EXACT_BM_NOT_REV_IC);
}
- else if (e->len > 0) {
+ else {
+ /* Even if BM skip table can't be built (e.g., pattern starts with
+ 's' or 'k' which have multi-byte case fold variants), we should
+ still use EXACT_IC optimization with the original pattern.
+ Without this fallback, patterns like /slackware/i have no
+ optimization at all, causing severe performance regression
+ especially with non-ASCII strings. See [Bug #21824] */
+ e->len = orig_len; /* Restore original length for EXACT_IC */
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;
}
- else
- return 0;
}
else {
reg->optimize = ONIG_OPTIMIZE_EXACT_IC;