From bf1ac3f4af16edb613b6795a4af253e9d551bd2c Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:49:25 +0100 Subject: [ruby/prism] Fix lexing for unterminated strings/heredocs etc. When we hit EOF and still have lex modes left, it means some content was unterminated. Heredocs specifically have logic that needs to happen when the body finished lexing. If we don't reset the mode back to how it was before, it will not continue lexing at the correct place. Followup to https://github.com/ruby/prism/pull/3918. We can't call into `parser_lex` since it resets token locations. https://github.com/ruby/prism/commit/27c24fdc0d --- prism/prism.c | 16 ++++++- .../errors/unterminated_heredoc_and_embexpr.txt | 11 +++++ .../errors/unterminated_heredoc_and_embexpr_2.txt | 9 ++++ test/prism/errors_test.rb | 22 +++++++++ test/prism/lex_test.rb | 53 ++++++++++++++++++++-- 5 files changed, 107 insertions(+), 4 deletions(-) create mode 100644 test/prism/errors/unterminated_heredoc_and_embexpr.txt create mode 100644 test/prism/errors/unterminated_heredoc_and_embexpr_2.txt diff --git a/prism/prism.c b/prism/prism.c index 34e5d38b0a..ca0f2a55aa 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -9783,6 +9783,12 @@ parser_lex(pm_parser_t *parser) { unsigned int semantic_token_seen = parser->semantic_token_seen; parser->semantic_token_seen = true; + // We'll jump to this label when we are about to encounter an EOF. + // If we still have lex_modes on the stack, we pop them so that cleanup + // can happen. For example, we should still continue parsing after a heredoc + // identifier, even if the heredoc body was syntactically invalid. + switch_lex_modes: + switch (parser->lex_modes.current->mode) { case PM_LEX_DEFAULT: case PM_LEX_EMBEXPR: @@ -9856,6 +9862,14 @@ parser_lex(pm_parser_t *parser) { // We'll check if we're at the end of the file. If we are, then we // need to return the EOF token. 
if (parser->current.end >= parser->end) { + // We may be missing closing tokens. We should pop modes one by one + // to do the appropriate cleanup like moving next_start for heredocs. + // Only when no mode is remaining will we actually emit the EOF token. + if (parser->lex_modes.current->mode != PM_LEX_DEFAULT) { + lex_mode_pop(parser); + goto switch_lex_modes; + } + // If we hit EOF, but the EOF came immediately after a newline, // set the start of the token to the newline. This way any EOF // errors will be reported as happening on that line rather than @@ -15433,7 +15447,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { pm_token_t opening = parser->previous; pm_statements_node_t *statements = NULL; - if (!match1(parser, PM_TOKEN_EMBEXPR_END)) { + if (!match3(parser, PM_TOKEN_EMBEXPR_END, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { pm_accepts_block_stack_push(parser, true); statements = parse_statements(parser, PM_CONTEXT_EMBEXPR, (uint16_t) (depth + 1)); pm_accepts_block_stack_pop(parser); diff --git a/test/prism/errors/unterminated_heredoc_and_embexpr.txt b/test/prism/errors/unterminated_heredoc_and_embexpr.txt new file mode 100644 index 0000000000..bed7fcd24e --- /dev/null +++ b/test/prism/errors/unterminated_heredoc_and_embexpr.txt @@ -0,0 +1,11 @@ +<= "3.3" - def test_lex_compare - prism = Prism.lex_compat(File.read(__FILE__), version: "current").value - ripper = Ripper.lex(File.read(__FILE__)) + def test_lex_compat + source = "foo bar" + prism = Prism.lex_compat(source, version: "current").value + ripper = Ripper.lex(source) assert_equal(ripper, prism) end end + + def test_lex_interpolation_unterminated + assert_equal( + %i[STRING_BEGIN EMBEXPR_BEGIN EOF], + token_types('"#{') + ) + + assert_equal( + %i[STRING_BEGIN EMBEXPR_BEGIN IGNORED_NEWLINE EOF], + token_types('"#{' + "\n") + ) + end + + def test_lex_interpolation_unterminated_with_content + # FIXME: Emits EOF twice. 
+ assert_equal( + %i[STRING_BEGIN EMBEXPR_BEGIN CONSTANT EOF EOF], + token_types('"#{C') + ) + + assert_equal( + %i[STRING_BEGIN EMBEXPR_BEGIN CONSTANT NEWLINE EOF], + token_types('"#{C' + "\n") + ) + end + + def test_lex_heredoc_unterminated + code = <<~'RUBY'.strip + <