[ruby/prism] Fix parsing heredoc ends

https://github.com/ruby/prism/commit/aa8c702271
author: Haldun Bayhantopcu <haldun@github.com> 2023-12-01 19:59:50 +0100
committer: git <svn-admin@ruby-lang.org> 2023-12-01 20:10:58 +0000
commit: 562d949e022cb3e7288256af8d0df3f4e17b66cb (patch)
tree: a851552964f82540eb2e4f7ffb37e6df8610b500 /prism
parent: 39238888bc784eb5887d899dc09fad30997464ac (diff)
1 files changed, 67 insertions, 28 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 21dfd58c67..9021f5b0f8 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -9761,24 +9761,43 @@ parser_lex(pm_parser_t *parser) {
             // terminator, then we need to return the ending of the heredoc.
             if (current_token_starts_line(parser)) {
                 const uint8_t *start = parser->current.start;
-                size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
-
-                if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
-                    bool matched = true;
+                if (start + ident_length <= parser->end) {
                     bool at_end = false;
+                    const uint8_t *newline = next_newline(start, parser->end - start);
+                    const uint8_t *ident_end = newline;
+                    const uint8_t *terminator_end = newline;
 
-                    size_t eol_length = match_eol_at(parser, start + ident_length);
-                    if (eol_length) {
-                        parser->current.end = start + ident_length + eol_length;
-                        pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
-                    } else if (parser->end == (start + ident_length)) {
-                        parser->current.end = start + ident_length;
+                    if (newline == NULL) {
+                        terminator_end = parser->end;
+                        ident_end = parser->end;
                         at_end = true;
                     } else {
-                        matched = false;
+                        terminator_end++;
+                        if (newline[-1] == '\r') {
+                            ident_end--; // Remove \r
+                        }
+                    }
+
+                    const uint8_t *terminator_start = ident_end - ident_length;
+                    const uint8_t *cursor = start;
+
+                    if (
+                        lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
+                        lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
+                    ) {
+                        while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
+                            cursor++;
+                        }
                     }
 
-                    if (matched) {
+                    if (
+                        (cursor == terminator_start) &&
+                        (memcmp(terminator_start, ident_start, ident_length) == 0)
+                    ) {
+                        if (newline != NULL) {
+                            pm_newline_list_append(&parser->newline_list, newline);
+                        }
+                        parser->current.end = terminator_end;
                         if (*lex_mode->as.heredoc.next_start == '\\') {
                             parser->next_start = NULL;
                         } else {
@@ -9794,7 +9813,7 @@ parser_lex(pm_parser_t *parser) {
                         LEX(PM_TOKEN_HEREDOC_END);
                     }
                 }
-
+                size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
                 if (
                     lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
                     (lex_mode->as.heredoc.common_whitespace > whitespace) &&
@@ -9838,23 +9857,35 @@ parser_lex(pm_parser_t *parser) {
                         // If we have a - or ~ heredoc, then we can match after
                         // some leading whitespace.
                         const uint8_t *start = breakpoint + 1;
-                        size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
 
-                        // If we have hit a newline that is followed by a valid
-                        // terminator, then we need to return the content of the
-                        // heredoc here as string content. Then, the next time a
-                        // token is lexed, it will match again and return the
-                        // end of the heredoc.
-                        if (
-                            !was_escaped_newline &&
-                            (start + ident_length <= parser->end) &&
-                            (memcmp(start, ident_start, ident_length) == 0)
-                        ) {
-                            // Heredoc terminators must be followed by a
-                            // newline, CRLF, or EOF to be valid.
+                        if (!was_escaped_newline && (start + ident_length <= parser->end)) {
+                            // We want to match the terminator starting from the end of the line in case
+                            // there is whitespace in the ident such as <<-'   DOC' or <<~'   DOC'.
+                            const uint8_t *newline = next_newline(start, parser->end - start);
+
+                            if (newline == NULL) {
+                                newline = parser->end;
+                            } else if (newline[-1] == '\r') {
+                                newline--; // Remove \r
+                            }
+
+                            // Start of a possible terminator.
+                            const uint8_t *terminator_start = newline - ident_length;
+
+                            // Cursor to check for the leading whitespace. We skip the
+                            // leading whitespace if we have a - or ~ heredoc.
+                            const uint8_t *cursor = start;
+
+                            if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
+                                lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
+                                while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
+                                    cursor++;
+                                }
+                            }
+
                             if (
-                                start + ident_length == parser->end ||
-                                match_eol_at(parser, start + ident_length)
+                                cursor == terminator_start &&
+                                (memcmp(terminator_start, ident_start, ident_length) == 0)
                             ) {
                                 parser->current.end = breakpoint + 1;
                                 pm_token_buffer_flush(parser, &token_buffer);
@@ -9862,6 +9893,14 @@ parser_lex(pm_parser_t *parser) {
                             }
                         }
 
+                        size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
+
+                        // If we have hit a newline that is followed by a valid
+                        // terminator, then we need to return the content of the
+                        // heredoc here as string content. Then, the next time a
+                        // token is lexed, it will match again and return the
+                        // end of the heredoc.
+
                         if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
                             if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
                                 lex_mode->as.heredoc.common_whitespace = whitespace;
author	Haldun Bayhantopcu <haldun@github.com>	2023-12-01 19:59:50 +0100
committer	git <svn-admin@ruby-lang.org>	2023-12-01 20:10:58 +0000
commit	562d949e022cb3e7288256af8d0df3f4e17b66cb (patch)
tree	a851552964f82540eb2e4f7ffb37e6df8610b500 /prism
parent	39238888bc784eb5887d899dc09fad30997464ac (diff)