diff options
| -rw-r--r-- | prism/prism.c | 132 | ||||
| -rw-r--r-- | test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt | 2 | ||||
| -rw-r--r-- | test/prism/snapshots/whitequark/ruby_bug_11989.txt | 2 |
3 files changed, 101 insertions, 35 deletions
diff --git a/prism/prism.c b/prism/prism.c index 55d0c92760..c9cfe65d76 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -8426,14 +8426,15 @@ parser_lex(pm_parser_t *parser) { // Now let's grab the information about the identifier off of the current // lex mode. - const uint8_t *ident_start = parser->lex_modes.current->as.heredoc.ident_start; - size_t ident_length = parser->lex_modes.current->as.heredoc.ident_length; + pm_lex_mode_t *lex_mode = parser->lex_modes.current; + const uint8_t *ident_start = lex_mode->as.heredoc.ident_start; + size_t ident_length = lex_mode->as.heredoc.ident_length; // If we are immediately following a newline and we have hit the // terminator, then we need to return the ending of the heredoc. if (current_token_starts_line(parser)) { const uint8_t *start = parser->current.start; - if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) { + if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) { start += pm_strspn_inline_whitespace(start, parser->end - start); } @@ -8453,10 +8454,10 @@ parser_lex(pm_parser_t *parser) { } if (matched) { - if (*parser->lex_modes.current->as.heredoc.next_start == '\\') { + if (*lex_mode->as.heredoc.next_start == '\\') { parser->next_start = NULL; } else { - parser->next_start = parser->lex_modes.current->as.heredoc.next_start; + parser->next_start = lex_mode->as.heredoc.next_start; parser->heredoc_end = parser->current.end; } @@ -8469,17 +8470,18 @@ parser_lex(pm_parser_t *parser) { } } - // Otherwise we'll be parsing string content. These are the places where - // we need to split up the content of the heredoc. We'll use strpbrk to - // find the first of these characters. + // Otherwise we'll be parsing string content. These are the places + // where we need to split up the content of the heredoc. We'll use + // strpbrk to find the first of these characters. uint8_t breakpoints[] = "\n\\#"; - pm_heredoc_quote_t quote = parser->lex_modes.current->as.heredoc.quote; + pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote; if (quote == PM_HEREDOC_QUOTE_SINGLE) { breakpoints[2] = '\0'; } const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + pm_token_buffer_t token_buffer = { 0 }; while (breakpoint != NULL) { switch (*breakpoint) { @@ -8491,13 +8493,14 @@ parser_lex(pm_parser_t *parser) { if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) { parser_flush_heredoc_end(parser); parser->current.end = breakpoint + 1; + pm_token_buffer_flush(parser, &token_buffer); LEX(PM_TOKEN_STRING_CONTENT); } pm_newline_list_append(&parser->newline_list, breakpoint); const uint8_t *start = breakpoint + 1; - if (parser->lex_modes.current->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) { + if (lex_mode->as.heredoc.indent != PM_HEREDOC_INDENT_NONE) { start += pm_strspn_inline_whitespace(start, parser->end - start); } @@ -8515,6 +8518,7 @@ parser_lex(pm_parser_t *parser) { match_eol_at(parser, start + ident_length) ) { parser->current.end = breakpoint + 1; + pm_token_buffer_flush(parser, &token_buffer); LEX(PM_TOKEN_STRING_CONTENT); } } @@ -8531,37 +8535,83 @@ parser_lex(pm_parser_t *parser) { // stop looping before the newline and not after the // newline so that we can still potentially find the // terminator of the heredoc. - size_t eol_length = match_eol_at(parser, breakpoint + 1); - if (eol_length) { - breakpoint += eol_length; - } else { - pm_unescape_type_t unescape_type = (quote == PM_HEREDOC_QUOTE_SINGLE) ? PM_UNESCAPE_MINIMAL : PM_UNESCAPE_ALL; - size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type); - if (difference == 0) { - // we're at the end of the file - breakpoint = NULL; - break; - } + parser->current.end = breakpoint + 1; + pm_token_buffer_escape(parser, &token_buffer); - pm_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); + // If we've hit the end of the file, then break out of + // the loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; + continue; + } - breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); + uint8_t peeked = peek(parser); + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push(&token_buffer, '\r'); + break; + } + /* fallthrough */ + case '\n': + // If this is a dedenting heredoc then we need + // to leave the escaped newline in place so that + // it can be removed later when we dedent the + // heredoc. + if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { + pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, '\n'); + } + + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. + pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + default: + if (quote == PM_HEREDOC_QUOTE_SINGLE) { + pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, peeked); + parser->current.end++; + } else { + escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_NONE); + } + + break; } + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); break; } case '#': { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type != PM_TOKEN_NOT_PROVIDED) { - LEX(type); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class + // or instance variable like "#@" but wasn't + // actually. In this case we'll just skip to the + // next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + break; } - // If we haven't returned at this point then we had something - // that looked like an interpolated class or instance variable - // like "#@" but wasn't actually. In this case we'll just skip - // to the next breakpoint. - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); - break; + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); } default: assert(false && "unreachable"); @@ -12499,7 +12549,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { node->location.end = opening.end; } else { - part = parse_string_part(parser); + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + parser_lex(parser); + } else { + part = parse_string_part(parser); + } if (part == NULL) { // If we get here, then we tried to find something in the @@ -12539,7 +12596,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { pm_node_list_append(&parts, part); while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { - if ((part = parse_string_part(parser)) != NULL) { + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + parser_lex(parser); + } else { + part = parse_string_part(parser); + } + + if (part != NULL) { pm_node_list_append(&parts, part); } } diff --git a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt index 20496c3cfc..ad38f53cee 100644 --- a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt +++ b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt @@ -114,7 +114,7 @@ │ ├── opening_loc: (51,0)-(51,9) = "<<-'HERE'" │ ├── content_loc: (52,0)-(53,0) = "a\\\nb\n" │ ├── closing_loc: (54,0)-(54,0) = "HERE\n" - │ └── unescaped: "a\\\nb\n" + │ └── unescaped: "ab\n" ├── @ XStringNode (location: (56,0)-(56,9)) │ ├── opening_loc: (56,0)-(56,9) = "<<-`HERE`" │ ├── content_loc: (57,0)-(58,0) = "a\\\nb\n" diff --git a/test/prism/snapshots/whitequark/ruby_bug_11989.txt b/test/prism/snapshots/whitequark/ruby_bug_11989.txt index 2d56025693..27ec4058af 100644 --- a/test/prism/snapshots/whitequark/ruby_bug_11989.txt +++ b/test/prism/snapshots/whitequark/ruby_bug_11989.txt @@ -16,7 +16,7 @@ │ ├── opening_loc: (1,2)-(1,8) = "<<~\"E\"" │ ├── content_loc: (2,0)-(2,0) = " x\\n y\n" │ ├── closing_loc: (3,0)-(3,0) = "E\n" - │ └── unescaped: "x\n y\n" + │ └── unescaped: "x\n y\n" ├── closing_loc: ∅ ├── block: ∅ ├── flags: ∅ |
