summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Mike Dalessio <mike.dalessio@shopify.com> 2023-08-25 10:12:13 -0400
committer git <svn-admin@ruby-lang.org> 2023-08-25 18:20:49 +0000
commit 3525c460f9a916f8089cbeca65fc3e893ca5d633 (patch)
tree 00744e62ff3ac00e2875c207415fbfe5997620e7
parent bf3d48e18261595ac19057319f378bfd44928045 (diff)
[ruby/yarp] fix: regexes and strings with escaped newline around a heredoc
Found via the fuzzer.

https://github.com/ruby/yarp/commit/501757135a

Co-authored-by: Kevin Newton <kddnewton@gmail.com>
-rw-r--r-- test/yarp/fixtures/wrapping_heredoc.txt 13
-rw-r--r-- test/yarp/parse_test.rb 4
-rw-r--r-- test/yarp/snapshots/wrapping_heredoc.txt 80
-rw-r--r-- yarp/yarp.c 40
4 files changed, 130 insertions, 7 deletions
diff --git a/test/yarp/fixtures/wrapping_heredoc.txt b/test/yarp/fixtures/wrapping_heredoc.txt
new file mode 100644
index 0000000000..d5fc710178
--- /dev/null
+++ b/test/yarp/fixtures/wrapping_heredoc.txt
@@ -0,0 +1,13 @@
+# test regex, string, and lists that wrap a heredoc thanks to an escaped newline
+
+# ripper incorrectly creates a "b\nc" string instead of two separate string tokens
+pp <<-A.gsub(/b\
+a
+A
+c/, "")
+
+# ripper incorrectly creates a "e\nf" string instead of two separate string tokens
+pp <<-A + "e\
+d
+A
+f"
diff --git a/test/yarp/parse_test.rb b/test/yarp/parse_test.rb
index c0f3ecf551..f8c1fe12d1 100644
--- a/test/yarp/parse_test.rb
+++ b/test/yarp/parse_test.rb
@@ -112,6 +112,10 @@ class ParseTest < Test::Unit::TestCase
# Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
return if relative == "seattlerb/heredoc_nested.txt"
+ # Ripper seems to have a bug that the regex portions before and after the heredoc are combined
+ # into a single token.
+ return if relative == "wrapping_heredoc.txt"
+
# Finally, assert that we can lex the source and get the same tokens as
# Ripper.
lex_result = YARP.lex_compat(source)
diff --git a/test/yarp/snapshots/wrapping_heredoc.txt b/test/yarp/snapshots/wrapping_heredoc.txt
new file mode 100644
index 0000000000..674db56ed1
--- /dev/null
+++ b/test/yarp/snapshots/wrapping_heredoc.txt
@@ -0,0 +1,80 @@
+ProgramNode(165...298)(
+ [],
+ StatementsNode(165...298)(
+ [CallNode(165...193)(
+ nil,
+ nil,
+ (165...167),
+ nil,
+ ArgumentsNode(168...193)(
+ [CallNode(168...193)(
+ InterpolatedStringNode(168...172)(
+ (168...172),
+ [StringNode(182...184)(nil, (182...184), nil, "a\n")],
+ (184...186)
+ ),
+ (172...173),
+ (173...177),
+ (177...178),
+ ArgumentsNode(178...192)(
+ [InterpolatedRegularExpressionNode(178...188)(
+ (178...179),
+ [StringNode(179...182)(nil, (179...182), nil, "b"),
+ StringNode(186...187)(nil, (186...187), nil, "c")],
+ (187...188),
+ 0
+ ),
+ StringNode(190...192)(
+ (190...191),
+ (191...191),
+ (191...192),
+ ""
+ )]
+ ),
+ (192...193),
+ nil,
+ 0,
+ "gsub"
+ )]
+ ),
+ nil,
+ nil,
+ 0,
+ "pp"
+ ),
+ CallNode(278...298)(
+ nil,
+ nil,
+ (278...280),
+ nil,
+ ArgumentsNode(281...298)(
+ [CallNode(281...298)(
+ InterpolatedStringNode(281...285)(
+ (281...285),
+ [StringNode(292...294)(nil, (292...294), nil, "d\n")],
+ (294...296)
+ ),
+ nil,
+ (286...287),
+ nil,
+ ArgumentsNode(288...298)(
+ [InterpolatedStringNode(288...298)(
+ (288...289),
+ [StringNode(289...292)(nil, (289...292), nil, "e"),
+ StringNode(296...297)(nil, (296...297), nil, "f")],
+ (297...298)
+ )]
+ ),
+ nil,
+ nil,
+ 0,
+ "+"
+ )]
+ ),
+ nil,
+ nil,
+ 0,
+ "pp"
+ )]
+ )
+)
diff --git a/yarp/yarp.c b/yarp/yarp.c
index a7b2290aa2..fdcc303b7f 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -6614,7 +6614,13 @@ parser_lex(yp_parser_t *parser) {
case YP_LEX_REGEXP: {
// First, we'll set to start of this token to be the current end.
- parser->current.start = parser->current.end;
+ if (parser->next_start == NULL) {
+ parser->current.start = parser->current.end;
+ } else {
+ parser->current.start = parser->next_start;
+ parser->current.end = parser->next_start;
+ parser->next_start = NULL;
+ }
// We'll check if we're at the end of the file. If we are, then we need to
// return the EOF token.
@@ -6693,9 +6699,19 @@ parser_lex(yp_parser_t *parser) {
if (*breakpoint == '\\') {
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
- // If the result is an escaped newline, then we need to
- // track that newline.
- yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
+ // If the result is an escaped newline ...
+ if (*(breakpoint + difference - 1) == '\n') {
+ if (parser->heredoc_end) {
+ // ... if we are on the same line as a heredoc, flush the heredoc and
+ // continue parsing after heredoc_end.
+ parser->current.end = breakpoint + difference;
+ parser_flush_heredoc_end(parser);
+ LEX(YP_TOKEN_STRING_CONTENT);
+ } else {
+ // ... else track the newline.
+ yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
+ }
+ }
breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
continue;
@@ -6833,9 +6849,19 @@ parser_lex(yp_parser_t *parser) {
yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
- // If the result is an escaped newline, then we need to
- // track that newline.
- yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
+ // If the result is an escaped newline ...
+ if (*(breakpoint + difference - 1) == '\n') {
+ if (parser->heredoc_end) {
+ // ... if we are on the same line as a heredoc, flush the heredoc and
+ // continue parsing after heredoc_end.
+ parser->current.end = breakpoint + difference;
+ parser_flush_heredoc_end(parser);
+ LEX(YP_TOKEN_STRING_CONTENT);
+ } else {
+ // ... else track the newline.
+ yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
+ }
+ }
breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
break;