[ruby/prism] Fix assertion on spanning heredocs

https://github.com/ruby/prism/commit/e190308845
author: Kevin Newton <kddnewton@gmail.com> 2024-01-09 10:01:20 -0500
committer: git <svn-admin@ruby-lang.org> 2024-01-09 15:15:52 +0000
commit: 88d7838445ec84b1cc630ce3bd97bb71cd0aefd4 (patch)
tree: 250dce4cad8ecfde3bc8a18b6f1128796be67c87
parent: 02d8bad6e12b8614f007c8c30eb50aff4bddcfb4 (diff)
6 files changed, 223 insertions, 50 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 398288e641..7ff50630ce 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -8094,6 +8094,34 @@ pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor,
 }
 
 /**
+ * Lex past the opening delimiter of a percent literal and return its first
+ * byte. An EOL delimiter is recorded, or flushes an already-lexed heredoc end.
+ */
+static uint8_t
+pm_lex_percent_delimiter(pm_parser_t *parser) {
+    size_t eol_length = match_eol(parser); // presumably the byte length of a line ending at current.end, 0 if none
+
+    if (eol_length) {
+        if (parser->heredoc_end) {
+            // A heredoc has already been lexed on this line, so this newline is
+            // already in the newline list. Do not append it a second time; just
+            // flush the heredoc end.
+            parser_flush_heredoc_end(parser);
+        } else {
+            // Otherwise, record the last byte of the line ending as a newline.
+            pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1);
+        }
+
+        const uint8_t delimiter = *parser->current.end; // NOTE(review): first byte only; if eol_length > 1 this is not the final newline byte - confirm callers expect that
+        parser->current.end += eol_length; // step over the whole line ending
+
+        return delimiter;
+    }
+
+    return *parser->current.end++; // not a line ending: consume exactly one delimiter byte
+}
+
+/**
  * This is a convenience macro that will set the current token type, call the
  * lex callback, and then return from the parser_lex function.
  */
@@ -9049,15 +9077,8 @@ parser_lex(pm_parser_t *parser) {
                                 pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
                             }
 
-                            lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
-
-                            size_t eol_length = match_eol(parser);
-                            if (eol_length) {
-                                parser->current.end += eol_length;
-                                pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
-                            } else {
-                                parser->current.end++;
-                            }
+                            const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                            lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
 
                             if (parser->current.end < parser->end) {
                                 LEX(PM_TOKEN_STRING_BEGIN);
@@ -9077,7 +9098,7 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_list(parser, false, *parser->current.end++);
+                                    lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
                                 } else {
                                     lex_mode_push_list_eof(parser);
                                 }
@@ -9088,7 +9109,7 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_list(parser, true, *parser->current.end++);
+                                    lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
                                 } else {
                                     lex_mode_push_list_eof(parser);
                                 }
@@ -9099,9 +9120,8 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_regexp(parser, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
-                                    pm_newline_list_check_append(&parser->newline_list, parser->current.end);
-                                    parser->current.end++;
+                                    const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                                    lex_mode_push_regexp(parser, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
                                 } else {
                                     lex_mode_push_regexp(parser, '\0', '\0');
                                 }
@@ -9112,9 +9132,8 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
-                                    pm_newline_list_check_append(&parser->newline_list, parser->current.end);
-                                    parser->current.end++;
+                                    const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                                    lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
                                 } else {
                                     lex_mode_push_string_eof(parser);
                                 }
@@ -9125,9 +9144,8 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
-                                    pm_newline_list_check_append(&parser->newline_list, parser->current.end);
-                                    parser->current.end++;
+                                    const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                                    lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
                                 } else {
                                     lex_mode_push_string_eof(parser);
                                 }
@@ -9138,9 +9156,9 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
+                                    const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                                    lex_mode_push_string(parser, false, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
                                     lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM);
-                                    parser->current.end++;
                                 } else {
                                     lex_mode_push_string_eof(parser);
                                 }
@@ -9151,7 +9169,7 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_list(parser, false, *parser->current.end++);
+                                    lex_mode_push_list(parser, false, pm_lex_percent_delimiter(parser));
                                 } else {
                                     lex_mode_push_list_eof(parser);
                                 }
@@ -9162,7 +9180,7 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_list(parser, true, *parser->current.end++);
+                                    lex_mode_push_list(parser, true, pm_lex_percent_delimiter(parser));
                                 } else {
                                     lex_mode_push_list_eof(parser);
                                 }
@@ -9173,8 +9191,8 @@ parser_lex(pm_parser_t *parser) {
                                 parser->current.end++;
 
                                 if (parser->current.end < parser->end) {
-                                    lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
-                                    parser->current.end++;
+                                    const uint8_t delimiter = pm_lex_percent_delimiter(parser);
+                                    lex_mode_push_string(parser, true, false, lex_mode_incrementor(delimiter), lex_mode_terminator(delimiter));
                                 } else {
                                     lex_mode_push_string_eof(parser);
                                 }
diff --git a/prism/util/pm_newline_list.c b/prism/util/pm_newline_list.c
index f27bb75b63..32a4a050fe 100644
--- a/prism/util/pm_newline_list.c
+++ b/prism/util/pm_newline_list.c
@@ -46,18 +46,6 @@ pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
 }
 
 /**
- * Conditionally append a new offset to the newline list, if the value passed in
- * is a newline.
- */
-bool
-pm_newline_list_check_append(pm_newline_list_t *list, const uint8_t *cursor) {
-    if (*cursor != '\n') {
-        return true;
-    }
-    return pm_newline_list_append(list, cursor);
-}
-
-/**
  * Returns the line and column of the given offset. If the offset is not in the
  * list, the line and column of the closest offset less than the given offset
  * are returned.
diff --git a/prism/util/pm_newline_list.h b/prism/util/pm_newline_list.h
index a31051f4e0..181283644f 100644
--- a/prism/util/pm_newline_list.h
+++ b/prism/util/pm_newline_list.h
@@ -73,17 +73,6 @@ bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t
 bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor);
 
 /**
- * Conditionally append a new offset to the newline list, if the value passed in
- * is a newline.
- *
- * @param list The list to append to.
- * @param cursor A pointer to the offset to append.
- * @return True if the reallocation of the offsets succeeds (if one was
- *     necessary), otherwise false.
- */
-bool pm_newline_list_check_append(pm_newline_list_t *list, const uint8_t *cursor);
-
-/**
  * Returns the line and column of the given offset. If the offset is not in the
  * list, the line and column of the closest offset less than the given offset
  * are returned.
diff --git a/test/prism/fixtures/spanning_heredoc_newlines.txt b/test/prism/fixtures/spanning_heredoc_newlines.txt
new file mode 100644
index 0000000000..32c9943aeb
--- /dev/null
+++ b/test/prism/fixtures/spanning_heredoc_newlines.txt
@@ -0,0 +1,23 @@
+<<A+%
+A
+
+
+<<A+%r
+A
+
+
+<<A+%q
+A
+
+
+<<A+%Q
+A
+
+
+<<A+%s
+A
+
+
+<<A+%x
+A
+
diff --git a/test/prism/parse_test.rb b/test/prism/parse_test.rb
index e2de55463c..6c5d125c3d 100644
--- a/test/prism/parse_test.rb
+++ b/test/prism/parse_test.rb
@@ -211,7 +211,7 @@ module Prism
     end
 
     Dir["*.txt", base: base].each do |relative|
-      next if relative == "newline_terminated.txt"
+      next if relative == "newline_terminated.txt" || relative == "spanning_heredoc_newlines.txt"
 
       # We test every snippet (separated by \n\n) in isolation
       # to ensure the parser does not try to read bytes further than the end of each snippet
diff --git a/test/prism/snapshots/spanning_heredoc_newlines.txt b/test/prism/snapshots/spanning_heredoc_newlines.txt
new file mode 100644
index 0000000000..0753089082
--- /dev/null
+++ b/test/prism/snapshots/spanning_heredoc_newlines.txt
@@ -0,0 +1,155 @@
+@ ProgramNode (location: (1,0)-(24,0))
+├── locals: []
+└── statements:
+    @ StatementsNode (location: (1,0)-(24,0))
+    └── body: (length: 6)
+        ├── @ CallNode (location: (1,0)-(4,0))
+        │   ├── flags: ∅
+        │   ├── receiver:
+        │   │   @ StringNode (location: (1,0)-(1,3))
+        │   │   ├── flags: ∅
+        │   │   ├── opening_loc: (1,0)-(1,3) = "<<A"
+        │   │   ├── content_loc: (2,0)-(2,0) = ""
+        │   │   ├── closing_loc: (2,0)-(3,0) = "A\n"
+        │   │   └── unescaped: ""
+        │   ├── call_operator_loc: ∅
+        │   ├── name: :+
+        │   ├── message_loc: (1,3)-(1,4) = "+"
+        │   ├── opening_loc: ∅
+        │   ├── arguments:
+        │   │   @ ArgumentsNode (location: (1,4)-(4,0))
+        │   │   ├── flags: ∅
+        │   │   └── arguments: (length: 1)
+        │   │       └── @ StringNode (location: (1,4)-(4,0))
+        │   │           ├── flags: ∅
+        │   │           ├── opening_loc: (1,4)-(2,0) = "%\n"
+        │   │           ├── content_loc: (3,0)-(3,0) = ""
+        │   │           ├── closing_loc: (3,0)-(4,0) = "\n"
+        │   │           └── unescaped: ""
+        │   ├── closing_loc: ∅
+        │   └── block: ∅
+        ├── @ CallNode (location: (5,0)-(8,0))
+        │   ├── flags: ∅
+        │   ├── receiver:
+        │   │   @ StringNode (location: (5,0)-(5,3))
+        │   │   ├── flags: ∅
+        │   │   ├── opening_loc: (5,0)-(5,3) = "<<A"
+        │   │   ├── content_loc: (6,0)-(6,0) = ""
+        │   │   ├── closing_loc: (6,0)-(7,0) = "A\n"
+        │   │   └── unescaped: ""
+        │   ├── call_operator_loc: ∅
+        │   ├── name: :+
+        │   ├── message_loc: (5,3)-(5,4) = "+"
+        │   ├── opening_loc: ∅
+        │   ├── arguments:
+        │   │   @ ArgumentsNode (location: (5,4)-(8,0))
+        │   │   ├── flags: ∅
+        │   │   └── arguments: (length: 1)
+        │   │       └── @ RegularExpressionNode (location: (5,4)-(8,0))
+        │   │           ├── flags: ∅
+        │   │           ├── opening_loc: (5,4)-(6,0) = "%r\n"
+        │   │           ├── content_loc: (6,0)-(6,0) = ""
+        │   │           ├── closing_loc: (7,0)-(8,0) = "\n"
+        │   │           └── unescaped: ""
+        │   ├── closing_loc: ∅
+        │   └── block: ∅
+        ├── @ CallNode (location: (9,0)-(12,0))
+        │   ├── flags: ∅
+        │   ├── receiver:
+        │   │   @ StringNode (location: (9,0)-(9,3))
+        │   │   ├── flags: ∅
+        │   │   ├── opening_loc: (9,0)-(9,3) = "<<A"
+        │   │   ├── content_loc: (10,0)-(10,0) = ""
+        │   │   ├── closing_loc: (10,0)-(11,0) = "A\n"
+        │   │   └── unescaped: ""
+        │   ├── call_operator_loc: ∅
+        │   ├── name: :+
+        │   ├── message_loc: (9,3)-(9,4) = "+"
+        │   ├── opening_loc: ∅
+        │   ├── arguments:
+        │   │   @ ArgumentsNode (location: (9,4)-(12,0))
+        │   │   ├── flags: ∅
+        │   │   └── arguments: (length: 1)
+        │   │       └── @ StringNode (location: (9,4)-(12,0))
+        │   │           ├── flags: ∅
+        │   │           ├── opening_loc: (9,4)-(10,0) = "%q\n"
+        │   │           ├── content_loc: (11,0)-(11,0) = ""
+        │   │           ├── closing_loc: (11,0)-(12,0) = "\n"
+        │   │           └── unescaped: ""
+        │   ├── closing_loc: ∅
+        │   └── block: ∅
+        ├── @ CallNode (location: (13,0)-(16,0))
+        │   ├── flags: ∅
+        │   ├── receiver:
+        │   │   @ StringNode (location: (13,0)-(13,3))
+        │   │   ├── flags: ∅
+        │   │   ├── opening_loc: (13,0)-(13,3) = "<<A"
+        │   │   ├── content_loc: (14,0)-(14,0) = ""
+        │   │   ├── closing_loc: (14,0)-(15,0) = "A\n"
+        │   │   └── unescaped: ""
+        │   ├── call_operator_loc: ∅
+        │   ├── name: :+
+        │   ├── message_loc: (13,3)-(13,4) = "+"
+        │   ├── opening_loc: ∅
+        │   ├── arguments:
+        │   │   @ ArgumentsNode (location: (13,4)-(16,0))
+        │   │   ├── flags: ∅
+        │   │   └── arguments: (length: 1)
+        │   │       └── @ StringNode (location: (13,4)-(16,0))
+        │   │           ├── flags: ∅
+        │   │           ├── opening_loc: (13,4)-(14,0) = "%Q\n"
+        │   │           ├── content_loc: (15,0)-(15,0) = ""
+        │   │           ├── closing_loc: (15,0)-(16,0) = "\n"
+        │   │           └── unescaped: ""
+        │   ├── closing_loc: ∅
+        │   └── block: ∅
+        ├── @ CallNode (location: (17,0)-(20,0))
+        │   ├── flags: ∅
+        │   ├── receiver:
+        │   │   @ StringNode (location: (17,0)-(17,3))
+        │   │   ├── flags: ∅
+        │   │   ├── opening_loc: (17,0)-(17,3) = "<<A"
+        │   │   ├── content_loc: (18,0)-(18,0) = ""
+        │   │   ├── closing_loc: (18,0)-(19,0) = "A\n"
+        │   │   └── unescaped: ""
+        │   ├── call_operator_loc: ∅
+        │   ├── name: :+
+        │   ├── message_loc: (17,3)-(17,4) = "+"
+        │   ├── opening_loc: ∅
+        │   ├── arguments:
+        │   │   @ ArgumentsNode (location: (17,4)-(20,0))
+        │   │   ├── flags: ∅
+        │   │   └── arguments: (length: 1)
+        │   │       └── @ SymbolNode (location: (17,4)-(20,0))
+        │   │           ├── flags: ∅
+        │   │           ├── opening_loc: (17,4)-(18,0) = "%s\n"
+        │   │           ├── value_loc: (18,0)-(18,0) = ""
+        │   │           ├── closing_loc: (19,0)-(20,0) = "\n"
+        │   │           └── unescaped: ""
+        │   ├── closing_loc: ∅
+        │   └── block: ∅
+        └── @ CallNode (location: (21,0)-(24,0))
+            ├── flags: ∅
+            ├── receiver:
+            │   @ StringNode (location: (21,0)-(21,3))
+            │   ├── flags: ∅
+            │   ├── opening_loc: (21,0)-(21,3) = "<<A"
+            │   ├── content_loc: (22,0)-(22,0) = ""
+            │   ├── closing_loc: (22,0)-(23,0) = "A\n"
+            │   └── unescaped: ""
+            ├── call_operator_loc: ∅
+            ├── name: :+
+            ├── message_loc: (21,3)-(21,4) = "+"
+            ├── opening_loc: ∅
+            ├── arguments:
+            │   @ ArgumentsNode (location: (21,4)-(24,0))
+            │   ├── flags: ∅
+            │   └── arguments: (length: 1)
+            │       └── @ XStringNode (location: (21,4)-(24,0))
+            │           ├── flags: ∅
+            │           ├── opening_loc: (21,4)-(22,0) = "%x\n"
+            │           ├── content_loc: (22,0)-(22,0) = ""
+            │           ├── closing_loc: (23,0)-(24,0) = "\n"
+            │           └── unescaped: ""
+            ├── closing_loc: ∅
+            └── block: ∅
author	Kevin Newton <kddnewton@gmail.com>	2024-01-09 10:01:20 -0500
committer	git <svn-admin@ruby-lang.org>	2024-01-09 15:15:52 +0000
commit	88d7838445ec84b1cc630ce3bd97bb71cd0aefd4 (patch)
tree	250dce4cad8ecfde3bc8a18b6f1128796be67c87
parent	02d8bad6e12b8614f007c8c30eb50aff4bddcfb4 (diff)