summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nobuyoshi Nakada <nobu@ruby-lang.org> 2025-01-03 10:25:15 +0900
committer: Nobuyoshi Nakada <nobu@ruby-lang.org> 2025-01-03 10:25:15 +0900
commit: 6bbb470dc77a671c67411a5d3a2564bd0a665a9c (patch)
tree: 9ebc955a3ae2e142bfaff933c146bf3397756541
parent: 77fe82286b24110292324007dafc436d01efd134 (diff)
[Bug #20504] Move dynamic regexp concatenation to iseq compiler
Notes
Notes: Merged: https://github.com/ruby/ruby/pull/12483
-rw-r--r-- compile.c | 118
-rw-r--r-- parse.y | 90
-rw-r--r-- test/.excludes-parsey/TestM17N.rb | 1
-rw-r--r-- test/.excludes-parsey/TestMixedUnicodeEscape.rb | 1
-rw-r--r-- test/.excludes-parsey/TestRubyLiteral.rb | 1
-rw-r--r-- test/ripper/test_ripper.rb | 1
6 files changed, 111 insertions, 101 deletions
diff --git a/compile.c b/compile.c
index 3f894cbe69..826260d182 100644
--- a/compile.c
+++ b/compile.c
@@ -3820,6 +3820,24 @@ iseq_peephole_optimize(rb_iseq_t *iseq, LINK_ELEMENT *list, const int do_tailcal
}
ELEM_REMOVE(&iobj->link);
}
+ if (IS_NEXT_INSN_ID(&iobj->link, toregexp)) {
+ INSN *next = (INSN *)iobj->link.next;
+ if (OPERAND_AT(next, 1) == INT2FIX(1)) {
+ VALUE src = OPERAND_AT(iobj, 0);
+ int opt = (int)FIX2LONG(OPERAND_AT(next, 0));
+ VALUE path = rb_iseq_path(iseq);
+ int line = iobj->insn_info.line_no;
+ VALUE errinfo = rb_errinfo();
+ VALUE re = rb_reg_compile(src, opt, RSTRING_PTR(path), line);
+ if (NIL_P(re)) {
+ VALUE message = rb_attr_get(rb_errinfo(), idMesg);
+ rb_set_errinfo(errinfo);
+ COMPILE_ERROR(iseq, line, "%" PRIsVALUE, message);
+ }
+ RB_OBJ_WRITE(iseq, &OPERAND_AT(iobj, 0), re);
+ ELEM_REMOVE(iobj->link.next);
+ }
+ }
}
if (IS_INSN_ID(iobj, concatstrings)) {
@@ -4502,47 +4520,91 @@ all_string_result_p(const NODE *node)
}
}
+struct dstr_ctxt {
+ rb_iseq_t *const iseq;
+ LINK_ANCHOR *const ret;
+ VALUE lit;
+ const NODE *lit_node;
+ int cnt;
+ int dregx;
+};
+
static int
-compile_dstr_fragments(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, int *cntp)
+append_dstr_fragment(struct dstr_ctxt *args, const NODE *const node, rb_parser_string_t *str)
{
- const struct RNode_LIST *list = RNODE_DSTR(node)->nd_next;
- VALUE lit = rb_node_dstr_string_val(node);
- LINK_ELEMENT *first_lit = 0;
- int cnt = 0;
-
- debugp_param("nd_lit", lit);
- if (!NIL_P(lit)) {
- cnt++;
- if (!RB_TYPE_P(lit, T_STRING)) {
- COMPILE_ERROR(ERROR_ARGS "dstr: must be string: %s",
- rb_builtin_type_name(TYPE(lit)));
+ VALUE s = rb_str_new_mutable_parser_string(str);
+ if (args->dregx) {
+ VALUE error = rb_reg_check_preprocess(s);
+ if (!NIL_P(error)) {
+ COMPILE_ERROR(args->iseq, nd_line(node), "%" PRIsVALUE, error);
return COMPILE_NG;
}
+ }
+ if (NIL_P(args->lit)) {
+ args->lit = s;
+ args->lit_node = node;
+ }
+ else {
+ rb_str_buf_append(args->lit, s);
+ }
+ return COMPILE_OK;
+}
+
+static void
+flush_dstr_fragment(struct dstr_ctxt *args)
+{
+ if (!NIL_P(args->lit)) {
+ rb_iseq_t *iseq = args->iseq;
+ VALUE lit = args->lit;
+ args->lit = Qnil;
lit = rb_fstring(lit);
- ADD_INSN1(ret, node, putobject, lit);
- RB_OBJ_WRITTEN(iseq, Qundef, lit);
- if (RSTRING_LEN(lit) == 0) first_lit = LAST_ELEMENT(ret);
+ ADD_INSN1(args->ret, args->lit_node, putobject, lit);
+ RB_OBJ_WRITTEN(args->iseq, Qundef, lit);
+ args->cnt++;
+ }
+}
+
+static int
+compile_dstr_fragments_0(struct dstr_ctxt *args, const NODE *const node)
+{
+ const struct RNode_LIST *list = RNODE_DSTR(node)->nd_next;
+ rb_parser_string_t *str = RNODE_DSTR(node)->string;
+
+ if (str) {
+ CHECK(append_dstr_fragment(args, node, str));
}
while (list) {
const NODE *const head = list->nd_head;
if (nd_type_p(head, NODE_STR)) {
- lit = rb_node_str_string_val(head);
- ADD_INSN1(ret, head, putobject, lit);
- RB_OBJ_WRITTEN(iseq, Qundef, lit);
- lit = Qnil;
+ CHECK(append_dstr_fragment(args, node, RNODE_STR(head)->string));
+ }
+ else if (nd_type_p(head, NODE_DSTR)) {
+ CHECK(compile_dstr_fragments_0(args, head));
}
else {
- CHECK(COMPILE(ret, "each string", head));
+ flush_dstr_fragment(args);
+ rb_iseq_t *iseq = args->iseq;
+ CHECK(COMPILE(args->ret, "each string", head));
+ args->cnt++;
}
- cnt++;
list = (struct RNode_LIST *)list->nd_next;
}
- if (NIL_P(lit) && first_lit) {
- ELEM_REMOVE(first_lit);
- --cnt;
- }
- *cntp = cnt;
+ return COMPILE_OK;
+}
+
+static int
+compile_dstr_fragments(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, int *cntp, int dregx)
+{
+ struct dstr_ctxt args = {
+ .iseq = iseq, .ret = ret,
+ .lit = Qnil, .lit_node = NULL,
+ .cnt = 0, .dregx = dregx,
+ };
+ CHECK(compile_dstr_fragments_0(&args, node));
+ flush_dstr_fragment(&args);
+
+ *cntp = args.cnt;
return COMPILE_OK;
}
@@ -4571,7 +4633,7 @@ compile_dstr(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node)
RB_OBJ_WRITTEN(iseq, Qundef, lit);
}
else {
- CHECK(compile_dstr_fragments(iseq, ret, node, &cnt));
+ CHECK(compile_dstr_fragments(iseq, ret, node, &cnt, FALSE));
ADD_INSN1(ret, node, concatstrings, INT2FIX(cnt));
}
return COMPILE_OK;
@@ -4593,7 +4655,7 @@ compile_dregx(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, i
return COMPILE_OK;
}
- CHECK(compile_dstr_fragments(iseq, ret, node, &cnt));
+ CHECK(compile_dstr_fragments(iseq, ret, node, &cnt, TRUE));
ADD_INSN2(ret, node, toregexp, INT2FIX(cflag), INT2FIX(cnt));
if (popped) {
diff --git a/parse.y b/parse.y
index 12fac7d50e..c8629c779b 100644
--- a/parse.y
+++ b/parse.y
@@ -1480,9 +1480,6 @@ static rb_ast_id_table_t *local_tbl(struct parser_params*);
static VALUE reg_compile(struct parser_params*, rb_parser_string_t*, int);
static void reg_fragment_setenc(struct parser_params*, rb_parser_string_t*, int);
-int rb_parser_reg_fragment_check(struct parser_params*, rb_parser_string_t*, int, rb_parser_reg_fragment_error_func);
-static void reg_fragment_error(struct parser_params *, VALUE);
-#define reg_fragment_check(p, str, option) rb_parser_reg_fragment_check(p, str, option, reg_fragment_error)
static int literal_concat0(struct parser_params *p, rb_parser_string_t *head, rb_parser_string_t *tail);
static NODE *heredoc_dedent(struct parser_params*,NODE*);
@@ -13161,12 +13158,26 @@ symbol_append(struct parser_params *p, NODE *symbols, NODE *symbol)
return list_append(p, symbols, symbol);
}
+static void
+dregex_fragment_setenc(struct parser_params *p, rb_node_dregx_t *const dreg, int options)
+{
+ if (dreg->string) {
+ reg_fragment_setenc(p, dreg->string, options);
+ }
+ for (struct RNode_LIST *list = dreg->nd_next; list; list = RNODE_LIST(list->nd_next)) {
+ NODE *frag = list->nd_head;
+ if (nd_type_p(frag, NODE_STR)) {
+ reg_fragment_setenc(p, RNODE_STR(frag)->string, options);
+ }
+ else if (nd_type_p(frag, NODE_DSTR)) {
+ dregex_fragment_setenc(p, RNODE_DSTR(frag), options);
+ }
+ }
+}
+
static NODE *
new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc)
{
- struct RNode_LIST *list;
- NODE *prev;
-
if (!node) {
/* Check string is valid regex */
rb_parser_string_t *str = STRING_NEW0();
@@ -13190,37 +13201,8 @@ new_regexp(struct parser_params *p, NODE *node, int options, const YYLTYPE *loc)
nd_set_loc(node, loc);
rb_node_dregx_t *const dreg = RNODE_DREGX(node);
dreg->as.nd_cflag = options & RE_OPTION_MASK;
- if (!dreg->nd_next) {
- /* Check string is valid regex */
- reg_compile(p, dreg->string, options);
- }
- else if (dreg->string) {
- reg_fragment_check(p, dreg->string, options);
- }
- prev = node;
- for (list = dreg->nd_next; list; list = RNODE_LIST(list->nd_next)) {
- NODE *frag = list->nd_head;
- enum node_type type = nd_type(frag);
- if (type == NODE_STR || (type == NODE_DSTR && !RNODE_DSTR(frag)->nd_next)) {
- rb_parser_string_t *tail = RNODE_STR(frag)->string;
- if (reg_fragment_check(p, tail, options) && prev && RNODE_DREGX(prev)->string) {
- rb_parser_string_t *lit = prev == node ? dreg->string : RNODE_STR(RNODE_LIST(prev)->nd_head)->string;
- if (!literal_concat0(p, lit, tail)) {
- return NEW_NIL(loc); /* dummy node on error */
- }
- rb_parser_str_resize(p, tail, 0);
- RNODE_LIST(prev)->nd_next = list->nd_next;
- rb_discard_node(p, list->nd_head);
- rb_discard_node(p, (NODE *)list);
- list = RNODE_LIST(prev);
- }
- else {
- prev = (NODE *)list;
- }
- }
- else {
- prev = 0;
- }
+ if (dreg->nd_next) {
+ dregex_fragment_setenc(p, dreg, options);
}
if (options & RE_OPTION_ONCE) {
node = NEW_ONCE(node, loc);
@@ -15363,13 +15345,7 @@ rb_reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int opt
rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
}
else if (rb_is_usascii_enc(p->enc)) {
- if (!rb_parser_is_ascii_string(p, str)) {
- /* raise in re.c */
- rb_parser_enc_associate(p, str, rb_usascii_encoding());
- }
- else {
- rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
- }
+ rb_parser_enc_associate(p, str, rb_ascii8bit_encoding());
}
return 0;
@@ -15385,30 +15361,6 @@ reg_fragment_setenc(struct parser_params* p, rb_parser_string_t *str, int option
if (c) reg_fragment_enc_error(p, str, c);
}
-static void
-reg_fragment_error(struct parser_params* p, VALUE err)
-{
- compile_error(p, "%"PRIsVALUE, err);
-}
-
-#ifndef RIPPER
-int
-rb_parser_reg_fragment_check(struct parser_params* p, rb_parser_string_t *str, int options, rb_parser_reg_fragment_error_func error)
-{
- VALUE err, str2;
- reg_fragment_setenc(p, str, options);
- /* TODO */
- str2 = rb_str_new_parser_string(str);
- err = rb_reg_check_preprocess(str2);
- if (err != Qnil) {
- err = rb_obj_as_string(err);
- error(p, err);
- return 0;
- }
- return 1;
-}
-#endif
-
#ifndef UNIVERSAL_PARSER
typedef struct {
struct parser_params* parser;
@@ -15507,7 +15459,7 @@ reg_compile(struct parser_params* p, rb_parser_string_t *str, int options)
if (NIL_P(re)) {
VALUE m = rb_attr_get(rb_errinfo(), idMesg);
rb_set_errinfo(err);
- reg_fragment_error(p, m);
+ compile_error(p, "%"PRIsVALUE, m);
return Qnil;
}
return re;
diff --git a/test/.excludes-parsey/TestM17N.rb b/test/.excludes-parsey/TestM17N.rb
deleted file mode 100644
index 7f8c44d02a..0000000000
--- a/test/.excludes-parsey/TestM17N.rb
+++ /dev/null
@@ -1 +0,0 @@
-exclude(:test_regexp_usascii, "https://bugs.ruby-lang.org/issues/20504")
diff --git a/test/.excludes-parsey/TestMixedUnicodeEscape.rb b/test/.excludes-parsey/TestMixedUnicodeEscape.rb
deleted file mode 100644
index 7bf964ebf1..0000000000
--- a/test/.excludes-parsey/TestMixedUnicodeEscape.rb
+++ /dev/null
@@ -1 +0,0 @@
-exclude(:test_basic, "https://bugs.ruby-lang.org/issues/20504")
diff --git a/test/.excludes-parsey/TestRubyLiteral.rb b/test/.excludes-parsey/TestRubyLiteral.rb
deleted file mode 100644
index 853f23a3b9..0000000000
--- a/test/.excludes-parsey/TestRubyLiteral.rb
+++ /dev/null
@@ -1 +0,0 @@
-exclude(:test_dregexp, "https://bugs.ruby-lang.org/issues/20504")
diff --git a/test/ripper/test_ripper.rb b/test/ripper/test_ripper.rb
index 5ca79e136f..070023b536 100644
--- a/test/ripper/test_ripper.rb
+++ b/test/ripper/test_ripper.rb
@@ -164,7 +164,6 @@ end
assert_equal([[1, 8], :on_tstring_end, "\"", state(:EXPR_END)], lex.shift)
assert_equal([[1, 9], :on_embexpr_end, "}", state(:EXPR_END)], lex.shift)
assert_equal([[1, 10], :on_regexp_end, "/", state(:EXPR_BEG)], lex.shift)
- assert_equal([[1, 11], :compile_error, "", state(:EXPR_END), "invalid multibyte character: /\\xCD/"], lex.shift)
assert_empty(lex)
end