diff options
author | Kevin Newton <kddnewton@gmail.com> | 2023-10-12 08:46:40 -0400 |
---|---|---|
committer | Kevin Newton <kddnewton@gmail.com> | 2023-10-13 15:31:30 -0400 |
commit | fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch) | |
tree | bf98c1898db99a2d35aa759c98dfb259f02055a5 | |
parent | e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff) |
[ruby/prism] Properly handle unescaping in regexp
https://github.com/ruby/prism/commit/abf9fd6863
-rw-r--r-- | prism/prism.c | 479 | ||||
-rw-r--r-- | test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt | 4 | ||||
-rw-r--r-- | test/prism/snapshots/regex.txt | 12 | ||||
-rw-r--r-- | test/prism/snapshots/seattlerb/bug190.txt | 2 | ||||
-rw-r--r-- | test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt | 2 | ||||
-rw-r--r-- | test/prism/snapshots/seattlerb/regexp_esc_u.txt | 2 | ||||
-rw-r--r-- | test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt | 4 | ||||
-rw-r--r-- | test/prism/snapshots/spanning_heredoc.txt | 4 | ||||
-rw-r--r-- | test/prism/snapshots/unescaping.txt | 2 | ||||
-rw-r--r-- | test/prism/snapshots/unparser/corpus/literal/literal.txt | 8 | ||||
-rw-r--r-- | test/prism/snapshots/unparser/corpus/semantic/literal.txt | 2 | ||||
-rw-r--r-- | test/prism/snapshots/whitequark/parser_bug_830.txt | 2 | ||||
-rw-r--r-- | test/prism/unescape_test.rb | 72 |
13 files changed, 317 insertions, 278 deletions
diff --git a/prism/prism.c b/prism/prism.c index 51afe730aa..e2c4a82394 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -4001,9 +4001,10 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) { return node; } -// Allocate a new RegularExpressionNode node. +// Allocate a new initialize a new RegularExpressionNode node with the given +// unescaped string. static pm_regular_expression_node_t * -pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { +pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { pm_regular_expression_node_t *node = PM_ALLOC_NODE(parser, pm_regular_expression_node_t); *node = (pm_regular_expression_node_t) { @@ -4018,12 +4019,18 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), .content_loc = PM_LOCATION_TOKEN_VALUE(content), .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .unescaped = PM_EMPTY_STRING + .unescaped = *unescaped }; return node; } +// Allocate a new initialize a new RegularExpressionNode node. +static inline pm_regular_expression_node_t * +pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { + return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING); +} + // Allocate a new RequiredDestructuredParameterNode node. static pm_required_destructured_parameter_node_t * pm_required_destructured_parameter_node_create(pm_parser_t *parser, const pm_token_t *opening) { @@ -4472,12 +4479,20 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, return node; } -// Allocate a new SymbolNode node. +// Allocate and initialize a new SymbolNode node. static inline pm_symbol_node_t * pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_EMPTY_STRING); } +// Allocate and initialize a new SymbolNode node with the current string. +static pm_symbol_node_t * +pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { + pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string); + parser->current_string = PM_EMPTY_STRING; + return node; +} + // Allocate and initialize a new SymbolNode node from a label. static pm_symbol_node_t * pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { @@ -6097,6 +6112,7 @@ static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0; static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1; static const uint8_t PM_ESCAPE_FLAG_META = 0x2; static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4; +static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8; // This is a lookup table for whether or not an ASCII character is printable. static const bool ascii_printable_chars[] = { @@ -6168,6 +6184,43 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st } } +// The regular expression engine doesn't support the same escape sequences as +// Ruby does. So first we have to read the escape sequence, and then we have to +// format it like the regular expression engine expects it. For example, in Ruby +// if we have: +// +// /\M-\C-?/ +// +// then the first byte is actually 255, so we have to rewrite this as: +// +// /\xFF/ +// +// Note that in this case there is a literal \ byte in the regular expression +// source so that the regular expression engine will perform its own unescaping. +static inline void +escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) { + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2); + + uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF); + uint8_t byte2 = (uint8_t) (byte & 0xF); + + if (byte1 >= 0xA) { + pm_buffer_append_u8(buffer, (uint8_t) ((byte1 - 0xA) + 'A')); + } else { + pm_buffer_append_u8(buffer, (uint8_t) (byte1 + '0')); + } + + if (byte2 >= 0xA) { + pm_buffer_append_u8(buffer, (uint8_t) (byte2 - 0xA + 'A')); + } else { + pm_buffer_append_u8(buffer, (uint8_t) (byte2 + '0')); + } + } else { + pm_buffer_append_u8(buffer, byte); + } +} + // Read the value of an escape into the buffer. static void escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { @@ -6245,6 +6298,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } case 'x': { + const uint8_t *start = parser->current.end - 1; + parser->current.end++; uint8_t byte = peek(parser); @@ -6258,7 +6313,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; } - pm_buffer_append_u8(buffer, value); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_buffer_append_u8(buffer, value); + } } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); } @@ -6266,6 +6325,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { return; } case 'u': { + const uint8_t *start = parser->current.end - 1; parser->current.end++; if ( @@ -6276,7 +6336,13 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { pm_char_is_hexadecimal_digit(parser->current.end[3]) ) { uint32_t value = escape_unicode(parser->current.end, 4); - escape_write_unicode(parser, buffer, parser->current.end, parser->current.end + 4, value); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start)); + } else { + escape_write_unicode(parser, buffer, start, parser->current.end + 4, value); + } + parser->current.end += 4; } else if (peek(parser) == '{') { const uint8_t *unicode_codepoints_start = parser->current.end - 2; @@ -6307,7 +6373,18 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } uint32_t value = escape_unicode(unicode_start, hexadecimal_length); - escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + if (codepoints_count == 1) { + pm_buffer_append_bytes(buffer, (const uint8_t *) "\\u{", 3); + } else { + pm_buffer_append_u8(buffer, ' '); + } + pm_buffer_append_bytes(buffer, unicode_start, hexadecimal_length); + } else { + escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value); + } + parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); } @@ -6318,6 +6395,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { if (peek(parser) == '}') { parser->current.end++; + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_u8(buffer, '}'); + } } else { pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); } @@ -6332,10 +6413,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { uint8_t peeked = peek(parser); switch (peeked) { - case '?': + case '?': { parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(0x7f, flags)); + escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); return; + } case '\\': if (flags & PM_ESCAPE_FLAG_CONTROL) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); @@ -6344,14 +6426,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; - default: + default: { if (!char_is_ascii_printable(peeked)) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); return; } + parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; + } } } case 'C': { @@ -6365,10 +6449,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { uint8_t peeked = peek(parser); switch (peeked) { - case '?': + case '?': { parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(0x7f, flags)); + escape_write_byte(buffer, flags, escape_byte(0x7f, flags)); return; + } case '\\': if (flags & PM_ESCAPE_FLAG_CONTROL) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT); @@ -6377,14 +6462,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { parser->current.end++; escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL); return; - default: + default: { if (!char_is_ascii_printable(peeked)) { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL); return; } + parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; + } } } case 'M': { @@ -6413,7 +6500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) { } parser->current.end++; - pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); + escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); return; } case '\r': { @@ -8106,6 +8193,7 @@ parser_lex(pm_parser_t *parser) { // characters. const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints; const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + pm_token_buffer_t token_buffer = { 0 }; while (breakpoint != NULL) { // If we hit a null byte, skip directly past it. @@ -8150,11 +8238,12 @@ parser_lex(pm_parser_t *parser) { // first. if (breakpoint > parser->current.start) { parser->current.end = breakpoint; + pm_token_buffer_flush(parser, &token_buffer); LEX(PM_TOKEN_STRING_CONTENT); } - // Since we've hit the terminator of the regular expression, we now - // need to parse the options. + // Since we've hit the terminator of the regular expression, + // we now need to parse the options. parser->current.end = breakpoint + 1; parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); @@ -8167,44 +8256,77 @@ parser_lex(pm_parser_t *parser) { // literally. In this case we'll skip past the next character // and find the next breakpoint. if (*breakpoint == '\\') { - size_t difference = pm_unescape_calculate_difference(parser, breakpoint, PM_UNESCAPE_ALL); - if (difference == 0) { - // we're at the end of the file - breakpoint = NULL; - continue; - } + parser->current.end = breakpoint + 1; + pm_token_buffer_escape(parser, &token_buffer); - // If the result is an escaped newline ... - if (breakpoint[difference - 1] == '\n') { - if (parser->heredoc_end) { - // ... if we are on the same line as a heredoc, flush the heredoc and - // continue parsing after heredoc_end. - parser->current.end = breakpoint + difference; - parser_flush_heredoc_end(parser); - LEX(PM_TOKEN_STRING_CONTENT); - } else { - // ... else track the newline. - pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + uint8_t peeked = peek(parser); + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, '\r'); + break; + } + /* fallthrough */ + case '\n': + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. + pm_newline_list_append(&parser->newline_list, parser->current.end); + } + + parser->current.end++; + break; + case 'c': + case 'C': + case 'M': + case 'u': + case 'x': + escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP); + break; + default: + if (lex_mode->as.regexp.terminator == '/' && peeked == '/') { + pm_token_buffer_push(&token_buffer, peeked); + parser->current.end++; + break; + } + + if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\'); + pm_token_buffer_push(&token_buffer, peeked); + parser->current.end++; + break; } - breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); + token_buffer.cursor = parser->current.end; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); continue; } // If we hit a #, then we will attempt to lex interpolation. if (*breakpoint == '#') { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type != PM_TOKEN_NOT_PROVIDED) { - LEX(type); + + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class or + // instance variable like "#@" but wasn't actually. In + // this case we'll just skip to the next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + continue; } - // If we haven't returned at this point then we had - // something that looked like an interpolated class or - // instance variable like "#@" but wasn't actually. In this - // case we'll just skip to the next breakpoint. - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); - continue; + if (type == PM_TOKEN_STRING_CONTENT) { + pm_token_buffer_flush(parser, &token_buffer); + } + + LEX(type); } // If we've hit the incrementor, then we need to skip past it @@ -8713,34 +8835,6 @@ parser_lex(pm_parser_t *parser) { /* Parse functions */ /******************************************************************************/ -// When we are parsing certain content, we need to unescape the content to -// provide to the consumers of the parser. The following functions accept a range -// of characters from the source and unescapes into the provided type. -// -// We have functions for unescaping regular expression nodes, string nodes, -// symbol nodes, and xstring nodes -static pm_regular_expression_node_t * -pm_regular_expression_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) { - pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, opening, content, closing); - - assert((content->end - content->start) >= 0); - pm_string_shared_init(&node->unescaped, content->start, content->end); - - pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type); - return node; -} - -static pm_string_node_t * -pm_string_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) { - pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing); - - assert((content->end - content->start) >= 0); - pm_string_shared_init(&node->unescaped, content->start, content->end); - - pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type); - return node; -} - // These are the various precedence rules. Because we are using a Pratt parser, // they are named binding power to represent the manner in which nodes are bound // together in the stack. @@ -10785,25 +10879,12 @@ parse_string_part(pm_parser_t *parser) { // "aaa #{bbb} #@ccc ddd" // ^^^^ ^ ^^^^ case PM_TOKEN_STRING_CONTENT: { - pm_unescape_type_t unescape_type = PM_UNESCAPE_ALL; - - if (parser->lex_modes.current->mode == PM_LEX_HEREDOC) { - if (parser->lex_modes.current->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) { - // If we're in a tilde heredoc, we want to unescape it later - // because we don't want unescaped newlines to disappear - // before we handle them in the dedent. - unescape_type = PM_UNESCAPE_NONE; - } else if (parser->lex_modes.current->as.heredoc.quote == PM_HEREDOC_QUOTE_SINGLE) { - unescape_type = PM_UNESCAPE_MINIMAL; - } - } - - parser_lex(parser); - pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); + pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - return (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, unescape_type); + parser_lex(parser); + return node; } // Here the lexer has returned the beginning of an embedded expression. In // that case we'll parse the inner statements and return that as the part. @@ -10946,15 +11027,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s } // Now we can parse the first part of the symbol. - pm_node_t *part; - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } + pm_node_t *part = parse_string_part(parser); // If we got a string part, then it's possible that we could transform // what looks like an interpolated symbol into a regular symbol. @@ -10971,16 +11044,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s if (part) pm_node_list_append(&node_list, part); while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&node_list, part); } } @@ -12026,10 +12090,7 @@ parse_strings(pm_parser_t *parser) { pm_node_list_append(&parts, part); while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing)); - parser_lex(parser); - } else if ((part = parse_string_part(parser)) != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&parts, part); } } @@ -12046,15 +12107,10 @@ parse_strings(pm_parser_t *parser) { // string content, in which case we need to parse the string as an // interpolated string. pm_node_list_t parts = PM_EMPTY_NODE_LIST; - pm_token_t string_opening = not_provided(parser); - pm_token_t string_closing = not_provided(parser); - pm_node_t *part = NULL; + pm_node_t *part; while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing)); - parser_lex(parser); - } else if ((part = parse_string_part(parser)) != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_node_list_append(&parts, part); } } @@ -12529,103 +12585,92 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { } node->location.end = opening.end; + } else if ((part = parse_string_part(parser)) == NULL) { + // If we get here, then we tried to find something in the + // heredoc but couldn't actually parse anything, so we'll just + // return a missing node. + node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + // If we get here, then the part that we parsed was plain string + // content and we're at the end of the heredoc, so we can return + // just a string node with the heredoc opening and closing as + // its opening and closing. + pm_string_node_t *cast = (pm_string_node_t *) part; + + cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); + cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); + cast->base.location = cast->opening_loc; + + if (quote == PM_HEREDOC_QUOTE_BACKTICK) { + assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t)); + cast->base.type = PM_X_STRING_NODE; + } + + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; + if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + } + + node = (pm_node_t *) cast; + lex_state_set(parser, PM_LEX_STATE_END); + expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); } else { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part == NULL) { - // If we get here, then we tried to find something in the - // heredoc but couldn't actually parse anything, so we'll just - // return a missing node. - node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); - } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { - // If we get here, then the part that we parsed was plain string - // content and we're at the end of the heredoc, so we can return - // just a string node with the heredoc opening and closing as - // its opening and closing. - pm_string_node_t *cast = (pm_string_node_t *) part; - - cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); - cast->base.location = cast->opening_loc; + // If we get here, then we have multiple parts in the heredoc, + // so we'll need to create an interpolated string node to hold + // them all. + pm_node_list_t parts = PM_EMPTY_NODE_LIST; + pm_node_list_append(&parts, part); - if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t)); - cast->base.type = PM_X_STRING_NODE; + while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + parser_lex(parser); + } else { + part = parse_string_part(parser); } - size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; - if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { - parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + if (part != NULL) { + pm_node_list_append(&parts, part); } + } + + // Now that we have all of the parts, create the correct type of + // interpolated node. + if (quote == PM_HEREDOC_QUOTE_BACKTICK) { + pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); + cast->parts = parts; - node = (pm_node_t *) cast; lex_state_set(parser, PM_LEX_STATE_END); expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); + + pm_interpolated_xstring_node_closing_set(cast, &parser->previous); + cast->base.location = cast->opening_loc; + node = (pm_node_t *) cast; } else { - // If we get here, then we have multiple parts in the heredoc, - // so we'll need to create an interpolated string node to hold - // them all. - pm_node_list_t parts = PM_EMPTY_NODE_LIST; - pm_node_list_append(&parts, part); + pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } + lex_state_set(parser, PM_LEX_STATE_END); + expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - if (part != NULL) { - pm_node_list_append(&parts, part); - } - } + pm_interpolated_string_node_closing_set(cast, &parser->previous); + cast->base.location = cast->opening_loc; + node = (pm_node_t *) cast; + } - // Now that we have all of the parts, create the correct type of - // interpolated node. + // If this is a heredoc that is indented with a ~, then we need + // to dedent each line by the common leading whitespace. + size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; + if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { + pm_node_list_t *nodes; if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening); - cast->parts = parts; - - lex_state_set(parser, PM_LEX_STATE_END); - expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - - pm_interpolated_xstring_node_closing_set(cast, &parser->previous); - cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + nodes = &((pm_interpolated_x_string_node_t *) node)->parts; } else { - pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - - lex_state_set(parser, PM_LEX_STATE_END); - expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM); - - pm_interpolated_string_node_closing_set(cast, &parser->previous); - cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + nodes = &((pm_interpolated_string_node_t *) node)->parts; } - // If this is a heredoc that is indented with a ~, then we need - // to dedent each line by the common leading whitespace. - size_t common_whitespace = lex_mode->as.heredoc.common_whitespace; - if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { - pm_node_list_t *nodes; - if (quote == PM_HEREDOC_QUOTE_BACKTICK) { - nodes = &((pm_interpolated_x_string_node_t *) node)->parts; - } else { - nodes = &((pm_interpolated_string_node_t *) node)->parts; - } - - parse_heredoc_dedent(parser, nodes, common_whitespace); - } + parse_heredoc_dedent(parser, nodes, common_whitespace); } } @@ -13586,11 +13631,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { if (match1(parser, PM_TOKEN_STRING_CONTENT)) { pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - - pm_symbol_node_t *symbol = (pm_symbol_node_t *) pm_symbol_node_create(parser, &opening, &parser->current, &closing); - symbol->unescaped = parser->current_string; - - pm_array_node_elements_append(array, (pm_node_t *) symbol); + pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)); } expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT); @@ -13633,11 +13674,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // If we hit content and the current node is NULL, then this is // the first string content we've seen. In that case we're going // to create a new string node and set that to the current. - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->current, &closing); - symbol->unescaped = parser->current_string; - + current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing); parser_lex(parser); - current = (pm_node_t *) symbol; } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { // If we hit string content and the current node is an // interpolated string, then we need to append the string content @@ -13930,6 +13968,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // expression at least has something in it. We'll need to check if the // following token is the end (in which case we can return a plain // regular expression) or if it's not then it has interpolation. + pm_string_t unescaped = parser->current_string; pm_token_t content = parser->current; parser_lex(parser); @@ -13937,7 +13976,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // without interpolation, which can be represented more succinctly and // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { - return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL); + return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); } // If we get here, then we have interpolation so we'll need to create @@ -13946,7 +13985,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL); + pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); pm_interpolated_regular_expression_node_append(node, part); } else { // If the first part of the body of the regular expression is not a @@ -13957,9 +13996,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { // Now that we're here and we have interpolation, we'll parse all of the // parts into the list. + pm_node_t *part; while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { - pm_node_t *part = parse_string_part(parser); - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_interpolated_regular_expression_node_append(node, part); } } @@ -14023,19 +14062,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) { node = pm_interpolated_xstring_node_create(parser, &opening, &opening); } + pm_node_t *part; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - pm_node_t *part = NULL; - - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else { - part = parse_string_part(parser); - } - - if (part != NULL) { + if ((part = parse_string_part(parser)) != NULL) { pm_interpolated_xstring_node_append(node, part); } } diff --git a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt index e158069bb6..e9bb768383 100644 --- a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt +++ b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt @@ -21,7 +21,7 @@ │ │ │ ├── opening_loc: (1,15)-(1,16) = "/" │ │ │ ├── content_loc: (1,16)-(1,20) = "^\\s{" │ │ │ ├── closing_loc: (1,20)-(1,21) = "/" - │ │ │ ├── unescaped: "^ {" + │ │ │ ├── unescaped: "^\\s{" │ │ │ └── flags: ∅ │ │ └── @ StringNode (location: (1,23)-(1,25)) │ │ ├── flags: ∅ @@ -51,7 +51,7 @@ │ │ ├── opening_loc: (5,15)-(5,16) = "/" │ │ ├── content_loc: (5,16)-(5,20) = "^\\s{" │ │ ├── closing_loc: (5,20)-(5,21) = "/" - │ │ ├── unescaped: "^ {" + │ │ ├── unescaped: "^\\s{" │ │ └── flags: ∅ │ └── @ StringNode (location: (5,23)-(5,25)) │ ├── flags: ∅ diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt index ff0e3d3b56..5fa07265a3 100644 --- a/test/prism/snapshots/regex.txt +++ b/test/prism/snapshots/regex.txt @@ -31,7 +31,7 @@ │ ├── opening_loc: (5,0)-(5,1) = "/" │ ├── content_loc: (5,1)-(5,4) = "a\\b" │ ├── closing_loc: (5,4)-(5,5) = "/" - │ ├── unescaped: "a\b" + │ ├── unescaped: "a\\b" │ └── flags: ∅ ├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11)) │ ├── opening_loc: (7,0)-(7,1) = "/" @@ -130,25 +130,25 @@ │ ├── opening_loc: (15,0)-(15,3) = "%r/" │ ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:" │ ├── closing_loc: (15,24)-(15,26) = "/i" - │ ├── unescaped: "[a-z$._?][w$.?\#@~]*:" + │ ├── unescaped: "[a-z$._?][\\w$.?\#@~]*:" │ └── flags: ignore_case ├── @ RegularExpressionNode (location: (17,0)-(17,37)) │ ├── opening_loc: (17,0)-(17,3) = "%r/" │ ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)" │ ├── closing_loc: (17,35)-(17,37) = "/i" - │ ├── unescaped: "([a-z$._?][w$.?\#@~]*)( +)(equ)" + │ ├── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)" │ └── flags: ignore_case ├── @ RegularExpressionNode (location: (19,0)-(19,25)) │ ├── opening_loc: (19,0)-(19,3) = "%r/" │ ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*" │ ├── closing_loc: (19,23)-(19,25) = "/i" - │ ├── unescaped: "[a-z$._?][w$.?\#@~]*" + │ ├── unescaped: "[a-z$._?][\\w$.?\#@~]*" │ └── flags: ignore_case ├── @ RegularExpressionNode (location: (21,0)-(24,1)) │ ├── opening_loc: (21,0)-(21,3) = "%r(" │ ├── content_loc: (21,3)-(23,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n (?:[\\w\#$%_']+)\n" │ ├── closing_loc: (24,0)-(24,1) = ")" - │ ├── unescaped: "\n(?:[w\#$%_']|()|(,)|[]|[0-9])*\n (?:[w\#$%_']+)\n" + │ ├── unescaped: "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n (?:[\\w\#$%_']+)\n" │ └── flags: ∅ ├── @ CallNode (location: (26,0)-(26,16)) │ ├── receiver: @@ -156,7 +156,7 @@ │ │ ├── opening_loc: (26,0)-(26,1) = "/" │ │ ├── content_loc: (26,1)-(26,7) = "(?#\\))" │ │ ├── closing_loc: (26,7)-(26,8) = "/" - │ │ ├── unescaped: "(?#))" + │ │ ├── unescaped: "(?#\\))" │ │ └── flags: ∅ │ ├── call_operator_loc: ∅ │ ├── message_loc: (26,9)-(26,11) = "=~" diff --git a/test/prism/snapshots/seattlerb/bug190.txt b/test/prism/snapshots/seattlerb/bug190.txt index 527304835a..fec48914c9 100644 --- a/test/prism/snapshots/seattlerb/bug190.txt +++ b/test/prism/snapshots/seattlerb/bug190.txt @@ -7,5 +7,5 @@ ├── opening_loc: (1,0)-(1,3) = "%r'" ├── content_loc: (1,3)-(1,5) = "\\'" ├── closing_loc: (1,5)-(1,6) = "'" - ├── unescaped: "'" + ├── unescaped: "\\'" └── flags: ∅ diff --git a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt index 3bc991033c..caf67b892d 100644 --- a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt +++ b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt @@ -7,5 +7,5 @@ ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,6) = "\\cC\\d" ├── closing_loc: (1,6)-(1,7) = "/" - ├── unescaped: "\u0003d" + ├── unescaped: "\\x03\\d" └── flags: ∅ diff --git a/test/prism/snapshots/seattlerb/regexp_esc_u.txt b/test/prism/snapshots/seattlerb/regexp_esc_u.txt index adbfe36880..ea6bbb6141 100644 --- a/test/prism/snapshots/seattlerb/regexp_esc_u.txt +++ b/test/prism/snapshots/seattlerb/regexp_esc_u.txt @@ -7,5 +7,5 @@ ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,16) = "[\\u0021-\\u0027]" ├── closing_loc: (1,16)-(1,17) = "/" - ├── unescaped: "[!-']" + ├── unescaped: "[\\u0021-\\u0027]" └── flags: ∅ diff --git a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt index 5e039bd16e..74e8b52787 100644 --- a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt +++ b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt @@ -7,11 +7,11 @@ │ ├── opening_loc: (1,0)-(1,1) = "/" │ ├── content_loc: (1,1)-(1,14) = "\\u{c0de babe}" │ ├── closing_loc: (1,14)-(1,15) = "/" - │ ├── unescaped: "샞몾" + │ ├── unescaped: "\\u{c0de babe}" │ └── flags: ∅ └── @ RegularExpressionNode (location: (3,0)-(3,8)) ├── opening_loc: (3,0)-(3,1) = "/" ├── content_loc: (3,1)-(3,7) = "\\u{df}" ├── closing_loc: (3,7)-(3,8) = "/" - ├── unescaped: "ß" + ├── unescaped: "\\u{df}" └── flags: ∅ diff --git a/test/prism/snapshots/spanning_heredoc.txt b/test/prism/snapshots/spanning_heredoc.txt index 2c59cb4368..6b3e3c92d7 100644 --- a/test/prism/snapshots/spanning_heredoc.txt +++ b/test/prism/snapshots/spanning_heredoc.txt @@ -28,10 +28,10 @@ │ │ │ ├── @ InterpolatedRegularExpressionNode (location: (4,13)-(7,2)) │ │ │ │ ├── opening_loc: (4,13)-(4,14) = "/" │ │ │ │ ├── parts: (length: 2) - │ │ │ │ │ ├── @ StringNode (location: (4,14)-(4,0)) + │ │ │ │ │ ├── @ StringNode (location: (4,14)-(4,16)) │ │ │ │ │ │ ├── flags: ∅ │ │ │ │ │ │ ├── opening_loc: ∅ - │ │ │ │ │ │ ├── content_loc: (4,14)-(4,0) = "b\\\n" + │ │ │ │ │ │ ├── content_loc: (4,14)-(4,16) = "b\\" │ │ │ │ │ │ ├── closing_loc: ∅ │ │ │ │ │ │ └── unescaped: "b" │ │ │ │ │ └── @ StringNode (location: (7,0)-(7,1)) diff --git a/test/prism/snapshots/unescaping.txt b/test/prism/snapshots/unescaping.txt index a59dc01626..ee7c3759cb 100644 --- a/test/prism/snapshots/unescaping.txt +++ b/test/prism/snapshots/unescaping.txt @@ -17,7 +17,7 @@ │ ├── opening_loc: (3,0)-(3,1) = "/" │ ├── content_loc: (3,1)-(3,7) = "\\c\#{1}" │ ├── closing_loc: (3,7)-(3,8) = "/" - │ ├── unescaped: "\u0003{1}" + │ ├── unescaped: "\\x03{1}" │ └── flags: ∅ ├── @ StringNode (location: (5,0)-(5,8)) │ ├── flags: ∅ diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt index 7c477382dc..21e73552ef 100644 --- a/test/prism/snapshots/unparser/corpus/literal/literal.txt +++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt @@ -545,7 +545,7 @@ │ ├── opening_loc: (50,0)-(50,1) = "/" │ ├── content_loc: (50,1)-(50,27) = "[^-+',.\\/:@[:alnum:]\\[\\]]+" │ ├── closing_loc: (50,27)-(50,28) = "/" - │ ├── unescaped: "[^-+',./:@[:alnum:][]]+" + │ ├── unescaped: "[^-+',./:@[:alnum:]\\[\\]]+" │ └── flags: ∅ ├── @ InterpolatedRegularExpressionNode (location: (51,0)-(51,12)) │ ├── opening_loc: (51,0)-(51,1) = "/" @@ -606,19 +606,19 @@ │ ├── opening_loc: (54,0)-(54,1) = "/" │ ├── content_loc: (54,1)-(54,3) = "\\n" │ ├── closing_loc: (54,3)-(54,4) = "/" - │ ├── unescaped: "\n" + │ ├── unescaped: "\\n" │ └── flags: ∅ ├── @ RegularExpressionNode (location: (55,0)-(55,4)) │ ├── opening_loc: (55,0)-(55,1) = "/" │ ├── content_loc: (55,1)-(55,3) = "\\n" │ ├── closing_loc: (55,3)-(55,4) = "/" - │ ├── unescaped: "\n" + │ ├── unescaped: "\\n" │ └── flags: ∅ ├── @ RegularExpressionNode (location: (56,0)-(56,5)) │ ├── opening_loc: (56,0)-(56,1) = "/" │ ├── content_loc: (56,1)-(56,3) = "\\n" │ ├── closing_loc: (56,3)-(56,5) = "/x" - │ ├── unescaped: "\n" + │ ├── unescaped: "\\n" │ └── flags: extended ├── @ RegularExpressionNode (location: (57,0)-(57,7)) │ ├── opening_loc: (57,0)-(57,1) = "/" diff --git a/test/prism/snapshots/unparser/corpus/semantic/literal.txt b/test/prism/snapshots/unparser/corpus/semantic/literal.txt index c79d0370da..6da3b56f33 100644 --- a/test/prism/snapshots/unparser/corpus/semantic/literal.txt +++ b/test/prism/snapshots/unparser/corpus/semantic/literal.txt @@ -33,7 +33,7 @@ │ ├── opening_loc: (10,0)-(10,3) = "%r(" │ ├── content_loc: (10,3)-(10,5) = "\\)" │ ├── closing_loc: (10,5)-(10,6) = ")" - │ ├── unescaped: ")" + │ ├── unescaped: "\\)" │ └── flags: ∅ ├── @ InterpolatedRegularExpressionNode (location: (11,0)-(11,14)) │ ├── opening_loc: (11,0)-(11,3) = "%r(" diff --git a/test/prism/snapshots/whitequark/parser_bug_830.txt b/test/prism/snapshots/whitequark/parser_bug_830.txt index f19fffbba0..e380113372 100644 --- a/test/prism/snapshots/whitequark/parser_bug_830.txt +++ b/test/prism/snapshots/whitequark/parser_bug_830.txt @@ -7,5 +7,5 @@ ├── opening_loc: (1,0)-(1,1) = "/" ├── content_loc: (1,1)-(1,3) = "\\(" ├── closing_loc: (1,3)-(1,4) = "/" - ├── unescaped: "(" + ├── unescaped: "\\(" └── flags: ∅ diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb index 123c139077..051b5e29d1 100644 --- a/test/prism/unescape_test.rb +++ b/test/prism/unescape_test.rb @@ -108,40 +108,50 @@ module Prism escapes = [*ascii, *ascii8, *newlines, *octal, *hex2, *hex4, *hex6, *ctrls] contexts = [ - [Context::String.new("?", ""), escapes], - [Context::String.new("'", "'"), escapes], - [Context::String.new("\"", "\""), escapes], - [Context::String.new("%q[", "]"), escapes], - [Context::String.new("%Q[", "]"), escapes], - [Context::String.new("%[", "]"), escapes], - [Context::String.new("`", "`"), escapes], - [Context::String.new("%x[", "]"), escapes], - [Context::String.new("<<H\n", "\nH"), escapes], - [Context::String.new("<<'H'\n", "\nH"), escapes], - [Context::String.new("<<\"H\"\n", "\nH"), escapes], - [Context::String.new("<<`H`\n", "\nH"), escapes], - [Context::String.new("<<-H\n", "\nH"), escapes], - [Context::String.new("<<-'H'\n", "\nH"), escapes], - [Context::String.new("<<-\"H\"\n", "\nH"), escapes], - [Context::String.new("<<-`H`\n", "\nH"), escapes], - [Context::Heredoc.new("<<~H\n", "\nH"), escapes], - [Context::Heredoc.new("<<~'H'\n", "\nH"), escapes], - [Context::Heredoc.new("<<~\"H\"\n", "\nH"), escapes], - [Context::Heredoc.new("<<~`H`\n", "\nH"), escapes], - [Context::List.new("%w[", "]"), escapes], - [Context::List.new("%W[", "]"), escapes], - [Context::List.new("%i[", "]"), escapes], - [Context::List.new("%I[", "]"), escapes], - [Context::Symbol.new("%s[", "]"), escapes], - [Context::Symbol.new(":'", "'"), escapes], - [Context::Symbol.new(":\"", "\""), escapes], - # [Context::RegExp.new("/", "/"), escapes], - # [Context::RegExp.new("%r[", "]"), escapes] + Context::String.new("?", ""), + Context::String.new("'", "'"), + Context::String.new("\"", "\""), + Context::String.new("%q[", "]"), + Context::String.new("%Q[", "]"), + Context::String.new("%[", "]"), + Context::String.new("`", "`"), + Context::String.new("%x[", "]"), + Context::String.new("<<H\n", "\nH"), + Context::String.new("<<'H'\n", "\nH"), + Context::String.new("<<\"H\"\n", "\nH"), + Context::String.new("<<`H`\n", "\nH"), + Context::String.new("<<-H\n", "\nH"), + Context::String.new("<<-'H'\n", "\nH"), + Context::String.new("<<-\"H\"\n", "\nH"), + Context::String.new("<<-`H`\n", "\nH"), + Context::Heredoc.new("<<~H\n", "\nH"), + Context::Heredoc.new("<<~'H'\n", "\nH"), + Context::Heredoc.new("<<~\"H\"\n", "\nH"), + Context::Heredoc.new("<<~`H`\n", "\nH"), + Context::List.new("%w[", "]"), + Context::List.new("%w<", ">"), + Context::List.new("%W[", "]"), + Context::List.new("%i[", "]"), + Context::List.new("%I[", "]"), + Context::Symbol.new("%s[", "]"), + Context::Symbol.new(":'", "'"), + Context::Symbol.new(":\"", "\""), + Context::RegExp.new("/", "/"), + Context::RegExp.new("%r[", "]"), + Context::RegExp.new("%r<", ">"), + Context::RegExp.new("%r{", "}"), + Context::RegExp.new("%r(", ")"), + Context::RegExp.new("%r|", "|"), ] - contexts.each do |(context, escapes)| + contexts.each do |context| escapes.each do |escape| - next if context.name == "?" && escape == "\xFF".b # wat? + # I think this might be a bug in Ruby. + next if context.name == "?" && escape == "\xFF".b + + # We don't currently support scanning for the number of capture groups, + # so these are all going to fail. + next if (context.name == "//" || context.name.start_with?("%r")) && escape.start_with?(/\d/) define_method(:"test_#{context.name}_#{escape.inspect}") do assert_unescape(context, escape) |