diff options
author | Haldun Bayhantopcu <haldun@github.com> | 2023-12-01 19:59:50 +0100 |
---|---|---|
committer | git <svn-admin@ruby-lang.org> | 2023-12-01 20:10:58 +0000 |
commit | 562d949e022cb3e7288256af8d0df3f4e17b66cb (patch) | |
tree | a851552964f82540eb2e4f7ffb37e6df8610b500 | |
parent | 39238888bc784eb5887d899dc09fad30997464ac (diff) |
[ruby/prism] Fix parsing heredoc ends
https://github.com/ruby/prism/commit/aa8c702271
-rw-r--r-- | prism/prism.c | 95 | ||||
-rw-r--r-- | test/prism/fixtures/heredocs_leading_whitespace.txt | 19 | ||||
-rw-r--r-- | test/prism/locals_test.rb | 7 | ||||
-rw-r--r-- | test/prism/parse_test.rb | 5 | ||||
-rw-r--r-- | test/prism/snapshots/heredocs_leading_whitespace.txt | 49 |
5 files changed, 147 insertions, 28 deletions
diff --git a/prism/prism.c b/prism/prism.c index 21dfd58c67..9021f5b0f8 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -9761,24 +9761,43 @@ parser_lex(pm_parser_t *parser) { // terminator, then we need to return the ending of the heredoc. if (current_token_starts_line(parser)) { const uint8_t *start = parser->current.start; - size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); - - if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) { - bool matched = true; + if (start + ident_length <= parser->end) { bool at_end = false; + const uint8_t *newline = next_newline(start, parser->end - start); + const uint8_t *ident_end = newline; + const uint8_t *terminator_end = newline; - size_t eol_length = match_eol_at(parser, start + ident_length); - if (eol_length) { - parser->current.end = start + ident_length + eol_length; - pm_newline_list_append(&parser->newline_list, parser->current.end - 1); - } else if (parser->end == (start + ident_length)) { - parser->current.end = start + ident_length; + if (newline == NULL) { + terminator_end = parser->end; + ident_end = parser->end; at_end = true; } else { - matched = false; + terminator_end++; + if (newline[-1] == '\r') { + ident_end--; // Remove \r + } + } + + const uint8_t *terminator_start = ident_end - ident_length; + const uint8_t *cursor = start; + + if ( + lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH || + lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE + ) { + while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) { + cursor++; + } } - if (matched) { + if ( + (cursor == terminator_start) && + (memcmp(terminator_start, ident_start, ident_length) == 0) + ) { + if (newline != NULL) { + pm_newline_list_append(&parser->newline_list, newline); + } + parser->current.end = terminator_end; if (*lex_mode->as.heredoc.next_start == '\\') { parser->next_start = NULL; } else { @@ -9794,7 +9813,7 @@ 
parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_HEREDOC_END); } } - + size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); if ( lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE && (lex_mode->as.heredoc.common_whitespace > whitespace) && @@ -9838,23 +9857,35 @@ parser_lex(pm_parser_t *parser) { // If we have a - or ~ heredoc, then we can match after // some leading whitespace. const uint8_t *start = breakpoint + 1; - size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); - // If we have hit a newline that is followed by a valid - // terminator, then we need to return the content of the - // heredoc here as string content. Then, the next time a - // token is lexed, it will match again and return the - // end of the heredoc. - if ( - !was_escaped_newline && - (start + ident_length <= parser->end) && - (memcmp(start, ident_start, ident_length) == 0) - ) { - // Heredoc terminators must be followed by a - // newline, CRLF, or EOF to be valid. + if (!was_escaped_newline && (start + ident_length <= parser->end)) { + // We want to match the terminator starting from the end of the line in case + // there is whitespace in the ident such as <<-' DOC' or <<~' DOC'. + const uint8_t *newline = next_newline(start, parser->end - start); + + if (newline == NULL) { + newline = parser->end; + } else if (newline[-1] == '\r') { + newline--; // Remove \r + } + + // Start of a possible terminator. + const uint8_t *terminator_start = newline - ident_length; + + // Cursor to check for the leading whitespace. We skip the + // leading whitespace if we have a - or ~ heredoc. 
+ const uint8_t *cursor = start; + + if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH || + lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { + while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) { + cursor++; + } + } + if ( - start + ident_length == parser->end || - match_eol_at(parser, start + ident_length) + cursor == terminator_start && + (memcmp(terminator_start, ident_start, ident_length) == 0) ) { parser->current.end = breakpoint + 1; pm_token_buffer_flush(parser, &token_buffer); @@ -9862,6 +9893,14 @@ parser_lex(pm_parser_t *parser) { } } + size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent); + + // If we have hit a newline that is followed by a valid + // terminator, then we need to return the content of the + // heredoc here as string content. Then, the next time a + // token is lexed, it will match again and return the + // end of the heredoc. + if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') { lex_mode->as.heredoc.common_whitespace = whitespace; diff --git a/test/prism/fixtures/heredocs_leading_whitespace.txt b/test/prism/fixtures/heredocs_leading_whitespace.txt new file mode 100644 index 0000000000..e786f08774 --- /dev/null +++ b/test/prism/fixtures/heredocs_leading_whitespace.txt @@ -0,0 +1,19 @@ +<<-' FOO' +a +b + FOO + +<<-' FOO' +a +b + FOO + +<<~' FOO' +a +b + FOO + +<<~' FOO' +a +b + FOO diff --git a/test/prism/locals_test.rb b/test/prism/locals_test.rb index 06324f9d94..df391ca048 100644 --- a/test/prism/locals_test.rb +++ b/test/prism/locals_test.rb @@ -68,6 +68,13 @@ module Prism # HERE todos << "seattlerb/heredoc_nested.txt" + # Ruby < 3.3.0 fails to parse: + # + # <<-' HERE' + # foo + # HERE + invalid << "heredocs_leading_whitespace.txt" if RUBY_VERSION < "3.3.0" + base = File.join(__dir__, "fixtures") skips = invalid | todos diff --git 
a/test/prism/parse_test.rb b/test/prism/parse_test.rb index 2feb15b48b..e2de55463c 100644 --- a/test/prism/parse_test.rb +++ b/test/prism/parse_test.rb @@ -111,6 +111,11 @@ module Prism # Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false. ripper_should_parse = false if relative == "spanning_heredoc.txt" + # Ruby < 3.3.0 cannot parse heredocs where there are leading whitespace characters in the heredoc start. + # Example: <<~' EOF' or <<-' EOF' + # https://bugs.ruby-lang.org/issues/19539 + ripper_should_parse = false if relative == "heredocs_leading_whitespace.txt" && RUBY_VERSION < "3.3.0" + define_method "test_filepath_#{relative}" do # First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows, # and explicitly set the external encoding to UTF-8 to override the binmode default. diff --git a/test/prism/snapshots/heredocs_leading_whitespace.txt b/test/prism/snapshots/heredocs_leading_whitespace.txt new file mode 100644 index 0000000000..06116821ca --- /dev/null +++ b/test/prism/snapshots/heredocs_leading_whitespace.txt @@ -0,0 +1,49 @@ +@ ProgramNode (location: (1,0)-(16,10)) +├── locals: [] +└── statements: + @ StatementsNode (location: (1,0)-(16,10)) + └── body: (length: 4) + ├── @ StringNode (location: (1,0)-(1,10)) + │ ├── flags: ∅ + │ ├── opening_loc: (1,0)-(1,10) = "<<-' FOO'" + │ ├── content_loc: (2,0)-(4,0) = "a\nb\n" + │ ├── closing_loc: (4,0)-(5,0) = " FOO\n" + │ └── unescaped: "a\nb\n" + ├── @ StringNode (location: (6,0)-(6,10)) + │ ├── flags: ∅ + │ ├── opening_loc: (6,0)-(6,10) = "<<-' FOO'" + │ ├── content_loc: (7,0)-(9,0) = "a\nb\n" + │ ├── closing_loc: (9,0)-(10,0) = " FOO\n" + │ └── unescaped: "a\nb\n" + ├── @ InterpolatedStringNode (location: (11,0)-(11,10)) + │ ├── opening_loc: (11,0)-(11,10) = "<<~' FOO'" + │ ├── parts: (length: 2) + │ │ ├── @ StringNode (location: (12,0)-(13,0)) + │ │ │ ├── flags: ∅ + │ │ │ ├── opening_loc: ∅ + │ │ │ ├── content_loc: (12,0)-(13,0) = 
"a\n" + │ │ │ ├── closing_loc: ∅ + │ │ │ └── unescaped: "a\n" + │ │ └── @ StringNode (location: (13,0)-(14,0)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: ∅ + │ │ ├── content_loc: (13,0)-(14,0) = "b\n" + │ │ ├── closing_loc: ∅ + │ │ └── unescaped: "b\n" + │ └── closing_loc: (14,0)-(15,0) = " FOO\n" + └── @ InterpolatedStringNode (location: (16,0)-(16,10)) + ├── opening_loc: (16,0)-(16,10) = "<<~' FOO'" + ├── parts: (length: 2) + │ ├── @ StringNode (location: (17,0)-(18,0)) + │ │ ├── flags: ∅ + │ │ ├── opening_loc: ∅ + │ │ ├── content_loc: (17,0)-(18,0) = "a\n" + │ │ ├── closing_loc: ∅ + │ │ └── unescaped: "a\n" + │ └── @ StringNode (location: (18,0)-(19,0)) + │ ├── flags: ∅ + │ ├── opening_loc: ∅ + │ ├── content_loc: (18,0)-(19,0) = "b\n" + │ ├── closing_loc: ∅ + │ └── unescaped: "b\n" + └── closing_loc: (19,0)-(20,0) = " FOO\n" |