[ruby/prism] Properly handle unescaping in regexp

https://github.com/ruby/prism/commit/abf9fd6863
author: Kevin Newton <kddnewton@gmail.com> 2023-10-12 08:46:40 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2023-10-13 15:31:30 -0400
commit: fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch)
tree: bf98c1898db99a2d35aa759c98dfb259f02055a5
parent: e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff)
13 files changed, 317 insertions, 278 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 51afe730aa..e2c4a82394 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -4001,9 +4001,10 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) {
     return node;
 }
 
-// Allocate a new RegularExpressionNode node.
+// Allocate a new initialize a new RegularExpressionNode node with the given
+// unescaped string.
 static pm_regular_expression_node_t *
-pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
+pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) {
     pm_regular_expression_node_t *node = PM_ALLOC_NODE(parser, pm_regular_expression_node_t);
 
     *node = (pm_regular_expression_node_t) {
@@ -4018,12 +4019,18 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening
         .opening_loc = PM_LOCATION_TOKEN_VALUE(opening),
         .content_loc = PM_LOCATION_TOKEN_VALUE(content),
         .closing_loc = PM_LOCATION_TOKEN_VALUE(closing),
-        .unescaped = PM_EMPTY_STRING
+        .unescaped = *unescaped
     };
 
     return node;
 }
 
+// Allocate a new initialize a new RegularExpressionNode node.
+static inline pm_regular_expression_node_t *
+pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) {
+    return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_EMPTY_STRING);
+}
+
 // Allocate a new RequiredDestructuredParameterNode node.
 static pm_required_destructured_parameter_node_t *
 pm_required_destructured_parameter_node_create(pm_parser_t *parser, const pm_token_t *opening) {
@@ -4472,12 +4479,20 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
     return node;
 }
 
-// Allocate a new SymbolNode node.
+// Allocate and initialize a new SymbolNode node.
 static inline pm_symbol_node_t *
 pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
     return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_EMPTY_STRING);
 }
 
+// Allocate and initialize a new SymbolNode node with the current string.
+static pm_symbol_node_t *
+pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
+    pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string);
+    parser->current_string = PM_EMPTY_STRING;
+    return node;
+}
+
 // Allocate and initialize a new SymbolNode node from a label.
 static pm_symbol_node_t *
 pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
@@ -6097,6 +6112,7 @@ static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0;
 static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1;
 static const uint8_t PM_ESCAPE_FLAG_META = 0x2;
 static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4;
+static const uint8_t PM_ESCAPE_FLAG_REGEXP = 0x8;
 
 // This is a lookup table for whether or not an ASCII character is printable.
 static const bool ascii_printable_chars[] = {
@@ -6168,6 +6184,43 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st
     }
 }
 
+// The regular expression engine doesn't support the same escape sequences as
+// Ruby does. So first we have to read the escape sequence, and then we have to
+// format it like the regular expression engine expects it. For example, in Ruby
+// if we have:
+//
+//     /\M-\C-?/
+//
+// then the first byte is actually 255, so we have to rewrite this as:
+//
+//     /\xFF/
+//
+// Note that in this case there is a literal \ byte in the regular expression
+// source so that the regular expression engine will perform its own unescaping.
+static inline void
+escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
+    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+        pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
+
+        uint8_t byte1 = (uint8_t) ((byte >> 4) & 0xF);
+        uint8_t byte2 = (uint8_t) (byte & 0xF);
+
+        if (byte1 >= 0xA) {
+            pm_buffer_append_u8(buffer, (uint8_t) ((byte1 - 0xA) + 'A'));
+        } else {
+            pm_buffer_append_u8(buffer, (uint8_t) (byte1 + '0'));
+        }
+
+        if (byte2 >= 0xA) {
+            pm_buffer_append_u8(buffer, (uint8_t) (byte2 - 0xA + 'A'));
+        } else {
+            pm_buffer_append_u8(buffer, (uint8_t) (byte2 + '0'));
+        }
+    } else {
+        pm_buffer_append_u8(buffer, byte);
+    }
+}
+
 // Read the value of an escape into the buffer.
 static void
 escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
@@ -6245,6 +6298,8 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             return;
         }
         case 'x': {
+            const uint8_t *start = parser->current.end - 1;
+
             parser->current.end++;
             uint8_t byte = peek(parser);
 
@@ -6258,7 +6313,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     parser->current.end++;
                 }
 
-                pm_buffer_append_u8(buffer, value);
+                if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                    pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
+                } else {
+                    pm_buffer_append_u8(buffer, value);
+                }
             } else {
                 pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
             }
@@ -6266,6 +6325,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             return;
         }
         case 'u': {
+            const uint8_t *start = parser->current.end - 1;
             parser->current.end++;
 
             if (
@@ -6276,7 +6336,13 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                 pm_char_is_hexadecimal_digit(parser->current.end[3])
             ) {
                 uint32_t value = escape_unicode(parser->current.end, 4);
-                escape_write_unicode(parser, buffer, parser->current.end, parser->current.end + 4, value);
+
+                if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                    pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
+                } else {
+                    escape_write_unicode(parser, buffer, start, parser->current.end + 4, value);
+                }
+
                 parser->current.end += 4;
             } else if (peek(parser) == '{') {
                 const uint8_t *unicode_codepoints_start = parser->current.end - 2;
@@ -6307,7 +6373,18 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     }
 
                     uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
-                    escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
+
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        if (codepoints_count == 1) {
+                            pm_buffer_append_bytes(buffer, (const uint8_t *) "\\u{", 3);
+                        } else {
+                            pm_buffer_append_u8(buffer, ' ');
+                        }
+                        pm_buffer_append_bytes(buffer, unicode_start, hexadecimal_length);
+                    } else {
+                        escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
+                    }
+
                     parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
                 }
 
@@ -6318,6 +6395,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
 
                 if (peek(parser) == '}') {
                     parser->current.end++;
+
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        pm_buffer_append_u8(buffer, '}');
+                    }
                 } else {
                     pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
                 }
@@ -6332,10 +6413,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             uint8_t peeked = peek(parser);
 
             switch (peeked) {
-                case '?':
+                case '?': {
                     parser->current.end++;
-                    pm_buffer_append_u8(buffer, escape_byte(0x7f, flags));
+                    escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
                     return;
+                }
                 case '\\':
                     if (flags & PM_ESCAPE_FLAG_CONTROL) {
                         pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
@@ -6344,14 +6426,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     parser->current.end++;
                     escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
                     return;
-                default:
+                default: {
                     if (!char_is_ascii_printable(peeked)) {
                         pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
                         return;
                     }
+
                     parser->current.end++;
-                    pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                     return;
+                }
             }
         }
         case 'C': {
@@ -6365,10 +6449,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             uint8_t peeked = peek(parser);
 
             switch (peeked) {
-                case '?':
+                case '?': {
                     parser->current.end++;
-                    pm_buffer_append_u8(buffer, escape_byte(0x7f, flags));
+                    escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
                     return;
+                }
                 case '\\':
                     if (flags & PM_ESCAPE_FLAG_CONTROL) {
                         pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
@@ -6377,14 +6462,16 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     parser->current.end++;
                     escape_read(parser, buffer, flags | PM_ESCAPE_FLAG_CONTROL);
                     return;
-                default:
+                default: {
                     if (!char_is_ascii_printable(peeked)) {
                         pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_CONTROL);
                         return;
                     }
+
                     parser->current.end++;
-                    pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                     return;
+                }
             }
         }
         case 'M': {
@@ -6413,7 +6500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             }
 
             parser->current.end++;
-            pm_buffer_append_u8(buffer, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
+            escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
             return;
         }
         case '\r': {
@@ -8106,6 +8193,7 @@ parser_lex(pm_parser_t *parser) {
             // characters.
             const uint8_t *breakpoints = lex_mode->as.regexp.breakpoints;
             const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
+            pm_token_buffer_t token_buffer = { 0 };
 
             while (breakpoint != NULL) {
                 // If we hit a null byte, skip directly past it.
@@ -8150,11 +8238,12 @@ parser_lex(pm_parser_t *parser) {
                     // first.
                     if (breakpoint > parser->current.start) {
                         parser->current.end = breakpoint;
+                        pm_token_buffer_flush(parser, &token_buffer);
                         LEX(PM_TOKEN_STRING_CONTENT);
                     }
 
-                    // Since we've hit the terminator of the regular expression, we now
-                    // need to parse the options.
+                    // Since we've hit the terminator of the regular expression,
+                    // we now need to parse the options.
                     parser->current.end = breakpoint + 1;
                     parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
 
@@ -8167,44 +8256,77 @@ parser_lex(pm_parser_t *parser) {
                 // literally. In this case we'll skip past the next character
                 // and find the next breakpoint.
                 if (*breakpoint == '\\') {
-                    size_t difference = pm_unescape_calculate_difference(parser, breakpoint, PM_UNESCAPE_ALL);
-                    if (difference == 0) {
-                        // we're at the end of the file
-                        breakpoint = NULL;
-                        continue;
-                    }
+                    parser->current.end = breakpoint + 1;
+                    pm_token_buffer_escape(parser, &token_buffer);
 
-                    // If the result is an escaped newline ...
-                    if (breakpoint[difference - 1] == '\n') {
-                        if (parser->heredoc_end) {
-                            // ... if we are on the same line as a heredoc, flush the heredoc and
-                            // continue parsing after heredoc_end.
-                            parser->current.end = breakpoint + difference;
-                            parser_flush_heredoc_end(parser);
-                            LEX(PM_TOKEN_STRING_CONTENT);
-                        } else {
-                            // ... else track the newline.
-                            pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
-                        }
+                    uint8_t peeked = peek(parser);
+                    switch (peeked) {
+                        case '\r':
+                            parser->current.end++;
+                            if (peek(parser) != '\n') {
+                                pm_token_buffer_push(&token_buffer, '\\');
+                                pm_token_buffer_push(&token_buffer, '\r');
+                                break;
+                            }
+                        /* fallthrough */
+                        case '\n':
+                            if (parser->heredoc_end) {
+                                // ... if we are on the same line as a heredoc,
+                                // flush the heredoc and continue parsing after
+                                // heredoc_end.
+                                parser_flush_heredoc_end(parser);
+                                pm_token_buffer_copy(parser, &token_buffer);
+                                LEX(PM_TOKEN_STRING_CONTENT);
+                            } else {
+                                // ... else track the newline.
+                                pm_newline_list_append(&parser->newline_list, parser->current.end);
+                            }
+
+                            parser->current.end++;
+                            break;
+                        case 'c':
+                        case 'C':
+                        case 'M':
+                        case 'u':
+                        case 'x':
+                            escape_read(parser, &token_buffer.buffer, PM_ESCAPE_FLAG_REGEXP);
+                            break;
+                        default:
+                            if (lex_mode->as.regexp.terminator == '/' && peeked == '/') {
+                                pm_token_buffer_push(&token_buffer, peeked);
+                                parser->current.end++;
+                                break;
+                            }
+
+                            if (peeked < 0x80) pm_token_buffer_push(&token_buffer, '\\');
+                            pm_token_buffer_push(&token_buffer, peeked);
+                            parser->current.end++;
+                            break;
                     }
 
-                    breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
+                    token_buffer.cursor = parser->current.end;
+                    breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
                     continue;
                 }
 
                 // If we hit a #, then we will attempt to lex interpolation.
                 if (*breakpoint == '#') {
                     pm_token_type_t type = lex_interpolation(parser, breakpoint);
-                    if (type != PM_TOKEN_NOT_PROVIDED) {
-                        LEX(type);
+
+                    if (type == PM_TOKEN_NOT_PROVIDED) {
+                        // If we haven't returned at this point then we had
+                        // something that looked like an interpolated class or
+                        // instance variable like "#@" but wasn't actually. In
+                        // this case we'll just skip to the next breakpoint.
+                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
+                        continue;
                     }
 
-                    // If we haven't returned at this point then we had
-                    // something that looked like an interpolated class or
-                    // instance variable like "#@" but wasn't actually. In this
-                    // case we'll just skip to the next breakpoint.
-                    breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
-                    continue;
+                    if (type == PM_TOKEN_STRING_CONTENT) {
+                        pm_token_buffer_flush(parser, &token_buffer);
+                    }
+
+                    LEX(type);
                 }
 
                 // If we've hit the incrementor, then we need to skip past it
@@ -8713,34 +8835,6 @@ parser_lex(pm_parser_t *parser) {
 /* Parse functions                                                            */
 /******************************************************************************/
 
-// When we are parsing certain content, we need to unescape the content to
-// provide to the consumers of the parser. The following functions accept a range
-// of characters from the source and unescapes into the provided type.
-//
-// We have functions for unescaping regular expression nodes, string nodes,
-// symbol nodes, and xstring nodes
-static pm_regular_expression_node_t *
-pm_regular_expression_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
-    pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, opening, content, closing);
-
-    assert((content->end - content->start) >= 0);
-    pm_string_shared_init(&node->unescaped, content->start, content->end);
-
-    pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
-    return node;
-}
-
-static pm_string_node_t *
-pm_string_node_create_and_unescape(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, pm_unescape_type_t unescape_type) {
-    pm_string_node_t *node = pm_string_node_create(parser, opening, content, closing);
-
-    assert((content->end - content->start) >= 0);
-    pm_string_shared_init(&node->unescaped, content->start, content->end);
-
-    pm_unescape_manipulate_string(parser, &node->unescaped, unescape_type);
-    return node;
-}
-
 // These are the various precedence rules. Because we are using a Pratt parser,
 // they are named binding power to represent the manner in which nodes are bound
 // together in the stack.
@@ -10785,25 +10879,12 @@ parse_string_part(pm_parser_t *parser) {
         //     "aaa #{bbb} #@ccc ddd"
         //      ^^^^      ^     ^^^^
         case PM_TOKEN_STRING_CONTENT: {
-            pm_unescape_type_t unescape_type = PM_UNESCAPE_ALL;
-
-            if (parser->lex_modes.current->mode == PM_LEX_HEREDOC) {
-                if (parser->lex_modes.current->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
-                    // If we're in a tilde heredoc, we want to unescape it later
-                    // because we don't want unescaped newlines to disappear
-                    // before we handle them in the dedent.
-                    unescape_type = PM_UNESCAPE_NONE;
-                } else if (parser->lex_modes.current->as.heredoc.quote == PM_HEREDOC_QUOTE_SINGLE) {
-                    unescape_type = PM_UNESCAPE_MINIMAL;
-                }
-            }
-
-            parser_lex(parser);
-
             pm_token_t opening = not_provided(parser);
             pm_token_t closing = not_provided(parser);
+            pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
 
-            return (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, unescape_type);
+            parser_lex(parser);
+            return node;
         }
         // Here the lexer has returned the beginning of an embedded expression. In
         // that case we'll parse the inner statements and return that as the part.
@@ -10946,15 +11027,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
         }
 
         // Now we can parse the first part of the symbol.
-        pm_node_t *part;
-        if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-            pm_token_t opening = not_provided(parser);
-            pm_token_t closing = not_provided(parser);
-            part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
-            parser_lex(parser);
-        } else {
-            part = parse_string_part(parser);
-        }
+        pm_node_t *part = parse_string_part(parser);
 
         // If we got a string part, then it's possible that we could transform
         // what looks like an interpolated symbol into a regular symbol.
@@ -10971,16 +11044,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
         if (part) pm_node_list_append(&node_list, part);
 
         while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
-            if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                pm_token_t opening = not_provided(parser);
-                pm_token_t closing = not_provided(parser);
-                part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
-                parser_lex(parser);
-            } else {
-                part = parse_string_part(parser);
-            }
-
-            if (part != NULL) {
+            if ((part = parse_string_part(parser)) != NULL) {
                 pm_node_list_append(&node_list, part);
             }
         }
@@ -12026,10 +12090,7 @@ parse_strings(pm_parser_t *parser) {
                 pm_node_list_append(&parts, part);
 
                 while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
-                    if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                        pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing));
-                        parser_lex(parser);
-                    } else if ((part = parse_string_part(parser)) != NULL) {
+                    if ((part = parse_string_part(parser)) != NULL) {
                         pm_node_list_append(&parts, part);
                     }
                 }
@@ -12046,15 +12107,10 @@ parse_strings(pm_parser_t *parser) {
             // string content, in which case we need to parse the string as an
             // interpolated string.
             pm_node_list_t parts = PM_EMPTY_NODE_LIST;
-            pm_token_t string_opening = not_provided(parser);
-            pm_token_t string_closing = not_provided(parser);
-            pm_node_t *part = NULL;
+            pm_node_t *part;
 
             while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
-                if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                    pm_node_list_append(&parts, (pm_node_t *) pm_string_node_create_current_string(parser, &string_opening, &parser->current, &string_closing));
-                    parser_lex(parser);
-                } else if ((part = parse_string_part(parser)) != NULL) {
+                if ((part = parse_string_part(parser)) != NULL) {
                     pm_node_list_append(&parts, part);
                 }
             }
@@ -12529,103 +12585,92 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                 }
 
                 node->location.end = opening.end;
+            } else if ((part = parse_string_part(parser)) == NULL) {
+                // If we get here, then we tried to find something in the
+                // heredoc but couldn't actually parse anything, so we'll just
+                // return a missing node.
+                node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
+            } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
+                // If we get here, then the part that we parsed was plain string
+                // content and we're at the end of the heredoc, so we can return
+                // just a string node with the heredoc opening and closing as
+                // its opening and closing.
+                pm_string_node_t *cast = (pm_string_node_t *) part;
+
+                cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening);
+                cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current);
+                cast->base.location = cast->opening_loc;
+
+                if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
+                    assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t));
+                    cast->base.type = PM_X_STRING_NODE;
+                }
+
+                size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
+                if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
+                    parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
+                }
+
+                node = (pm_node_t *) cast;
+                lex_state_set(parser, PM_LEX_STATE_END);
+                expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
             } else {
-                if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                    pm_token_t opening = not_provided(parser);
-                    pm_token_t closing = not_provided(parser);
-                    part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
-                    parser_lex(parser);
-                } else {
-                    part = parse_string_part(parser);
-                }
-
-                if (part == NULL) {
-                    // If we get here, then we tried to find something in the
-                    // heredoc but couldn't actually parse anything, so we'll just
-                    // return a missing node.
-                    node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end);
-                } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
-                    // If we get here, then the part that we parsed was plain string
-                    // content and we're at the end of the heredoc, so we can return
-                    // just a string node with the heredoc opening and closing as
-                    // its opening and closing.
-                    pm_string_node_t *cast = (pm_string_node_t *) part;
-
-                    cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening);
-                    cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current);
-                    cast->base.location = cast->opening_loc;
+                // If we get here, then we have multiple parts in the heredoc,
+                // so we'll need to create an interpolated string node to hold
+                // them all.
+                pm_node_list_t parts = PM_EMPTY_NODE_LIST;
+                pm_node_list_append(&parts, part);
 
-                    if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
-                        assert(sizeof(pm_string_node_t) == sizeof(pm_x_string_node_t));
-                        cast->base.type = PM_X_STRING_NODE;
+                while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
+                    if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
+                        pm_token_t opening = not_provided(parser);
+                        pm_token_t closing = not_provided(parser);
+                        part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
+                        parser_lex(parser);
+                    } else {
+                        part = parse_string_part(parser);
                     }
 
-                    size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
-                    if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
-                        parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
+                    if (part != NULL) {
+                        pm_node_list_append(&parts, part);
                     }
+                }
+
+                // Now that we have all of the parts, create the correct type of
+                // interpolated node.
+                if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
+                    pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening);
+                    cast->parts = parts;
 
-                    node = (pm_node_t *) cast;
                     lex_state_set(parser, PM_LEX_STATE_END);
                     expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
+
+                    pm_interpolated_xstring_node_closing_set(cast, &parser->previous);
+                    cast->base.location = cast->opening_loc;
+                    node = (pm_node_t *) cast;
                 } else {
-                    // If we get here, then we have multiple parts in the heredoc,
-                    // so we'll need to create an interpolated string node to hold
-                    // them all.
-                    pm_node_list_t parts = PM_EMPTY_NODE_LIST;
-                    pm_node_list_append(&parts, part);
+                    pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening);
 
-                    while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
-                        if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                            pm_token_t opening = not_provided(parser);
-                            pm_token_t closing = not_provided(parser);
-                            part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
-                            parser_lex(parser);
-                        } else {
-                            part = parse_string_part(parser);
-                        }
+                    lex_state_set(parser, PM_LEX_STATE_END);
+                    expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
 
-                        if (part != NULL) {
-                            pm_node_list_append(&parts, part);
-                        }
-                    }
+                    pm_interpolated_string_node_closing_set(cast, &parser->previous);
+                    cast->base.location = cast->opening_loc;
+                    node = (pm_node_t *) cast;
+                }
 
-                    // Now that we have all of the parts, create the correct type of
-                    // interpolated node.
+                // If this is a heredoc that is indented with a ~, then we need
+                // to dedent each line by the common leading whitespace.
+                size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
+                if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
+                    pm_node_list_t *nodes;
                     if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
-                        pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening);
-                        cast->parts = parts;
-
-                        lex_state_set(parser, PM_LEX_STATE_END);
-                        expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
-
-                        pm_interpolated_xstring_node_closing_set(cast, &parser->previous);
-                        cast->base.location = cast->opening_loc;
-                        node = (pm_node_t *) cast;
+                        nodes = &((pm_interpolated_x_string_node_t *) node)->parts;
                     } else {
-                        pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening);
-
-                        lex_state_set(parser, PM_LEX_STATE_END);
-                        expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
-
-                        pm_interpolated_string_node_closing_set(cast, &parser->previous);
-                        cast->base.location = cast->opening_loc;
-                        node = (pm_node_t *) cast;
+                        nodes = &((pm_interpolated_string_node_t *) node)->parts;
                     }
 
-                    // If this is a heredoc that is indented with a ~, then we need
-                    // to dedent each line by the common leading whitespace.
-                    size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
-                    if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
-                        pm_node_list_t *nodes;
-                        if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
-                            nodes = &((pm_interpolated_x_string_node_t *) node)->parts;
-                        } else {
-                            nodes = &((pm_interpolated_string_node_t *) node)->parts;
-                        }
-
-                        parse_heredoc_dedent(parser, nodes, common_whitespace);
-                    }
+                    parse_heredoc_dedent(parser, nodes, common_whitespace);
                 }
             }
 
@@ -13586,11 +13631,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                 if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
                     pm_token_t opening = not_provided(parser);
                     pm_token_t closing = not_provided(parser);
-
-                    pm_symbol_node_t *symbol = (pm_symbol_node_t *) pm_symbol_node_create(parser, &opening, &parser->current, &closing);
-                    symbol->unescaped = parser->current_string;
-
-                    pm_array_node_elements_append(array, (pm_node_t *) symbol);
+                    pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing));
                 }
 
                 expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT);
@@ -13633,11 +13674,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                             // If we hit content and the current node is NULL, then this is
                             // the first string content we've seen. In that case we're going
                             // to create a new string node and set that to the current.
-                            pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->current, &closing);
-                            symbol->unescaped = parser->current_string;
-
+                            current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing);
                             parser_lex(parser);
-                            current = (pm_node_t *) symbol;
                         } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) {
                             // If we hit string content and the current node is an
                             // interpolated string, then we need to append the string content
@@ -13930,6 +13968,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                 // expression at least has something in it. We'll need to check if the
                 // following token is the end (in which case we can return a plain
                 // regular expression) or if it's not then it has interpolation.
+                pm_string_t unescaped = parser->current_string;
                 pm_token_t content = parser->current;
                 parser_lex(parser);
 
@@ -13937,7 +13976,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                 // without interpolation, which can be represented more succinctly and
                 // more easily compiled.
                 if (accept1(parser, PM_TOKEN_REGEXP_END)) {
-                    return (pm_node_t *) pm_regular_expression_node_create_and_unescape(parser, &opening, &content, &parser->previous, PM_UNESCAPE_ALL);
+                    return (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
                 }
 
                 // If we get here, then we have interpolation so we'll need to create
@@ -13946,7 +13985,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
 
                 pm_token_t opening = not_provided(parser);
                 pm_token_t closing = not_provided(parser);
-                pm_node_t *part = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_ALL);
+                pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
                 pm_interpolated_regular_expression_node_append(node, part);
             } else {
                 // If the first part of the body of the regular expression is not a
@@ -13957,9 +13996,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
 
             // Now that we're here and we have interpolation, we'll parse all of the
             // parts into the list.
+            pm_node_t *part;
             while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) {
-                pm_node_t *part = parse_string_part(parser);
-                if (part != NULL) {
+                if ((part = parse_string_part(parser)) != NULL) {
                     pm_interpolated_regular_expression_node_append(node, part);
                 }
             }
@@ -14023,19 +14062,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
                 node = pm_interpolated_xstring_node_create(parser, &opening, &opening);
             }
 
+            pm_node_t *part;
             while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
-                pm_node_t *part = NULL;
-
-                if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
-                    pm_token_t opening = not_provided(parser);
-                    pm_token_t closing = not_provided(parser);
-                    part = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
-                    parser_lex(parser);
-                } else {
-                    part = parse_string_part(parser);
-                }
-
-                if (part != NULL) {
+                if ((part = parse_string_part(parser)) != NULL) {
                     pm_interpolated_xstring_node_append(node, part);
                 }
             }
diff --git a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
index e158069bb6..e9bb768383 100644
--- a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
+++ b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
@@ -21,7 +21,7 @@
         │   │       │   ├── opening_loc: (1,15)-(1,16) = "/"
         │   │       │   ├── content_loc: (1,16)-(1,20) = "^\\s{"
         │   │       │   ├── closing_loc: (1,20)-(1,21) = "/"
-        │   │       │   ├── unescaped: "^ {"
+        │   │       │   ├── unescaped: "^\\s{"
         │   │       │   └── flags: ∅
         │   │       └── @ StringNode (location: (1,23)-(1,25))
         │   │           ├── flags: ∅
@@ -51,7 +51,7 @@
             │       │   ├── opening_loc: (5,15)-(5,16) = "/"
             │       │   ├── content_loc: (5,16)-(5,20) = "^\\s{"
             │       │   ├── closing_loc: (5,20)-(5,21) = "/"
-            │       │   ├── unescaped: "^ {"
+            │       │   ├── unescaped: "^\\s{"
             │       │   └── flags: ∅
             │       └── @ StringNode (location: (5,23)-(5,25))
             │           ├── flags: ∅
diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt
index ff0e3d3b56..5fa07265a3 100644
--- a/test/prism/snapshots/regex.txt
+++ b/test/prism/snapshots/regex.txt
@@ -31,7 +31,7 @@
         │   ├── opening_loc: (5,0)-(5,1) = "/"
         │   ├── content_loc: (5,1)-(5,4) = "a\\b"
         │   ├── closing_loc: (5,4)-(5,5) = "/"
-        │   ├── unescaped: "a\b"
+        │   ├── unescaped: "a\\b"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
         │   ├── opening_loc: (7,0)-(7,1) = "/"
@@ -130,25 +130,25 @@
         │   ├── opening_loc: (15,0)-(15,3) = "%r/"
         │   ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:"
         │   ├── closing_loc: (15,24)-(15,26) = "/i"
-        │   ├── unescaped: "[a-z$._?][w$.?\#@~]*:"
+        │   ├── unescaped: "[a-z$._?][\\w$.?\#@~]*:"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (17,0)-(17,37))
         │   ├── opening_loc: (17,0)-(17,3) = "%r/"
         │   ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         │   ├── closing_loc: (17,35)-(17,37) = "/i"
-        │   ├── unescaped: "([a-z$._?][w$.?\#@~]*)( +)(equ)"
+        │   ├── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (19,0)-(19,25))
         │   ├── opening_loc: (19,0)-(19,3) = "%r/"
         │   ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*"
         │   ├── closing_loc: (19,23)-(19,25) = "/i"
-        │   ├── unescaped: "[a-z$._?][w$.?\#@~]*"
+        │   ├── unescaped: "[a-z$._?][\\w$.?\#@~]*"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (21,0)-(24,1))
         │   ├── opening_loc: (21,0)-(21,3) = "%r("
         │   ├── content_loc: (21,3)-(23,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
         │   ├── closing_loc: (24,0)-(24,1) = ")"
-        │   ├── unescaped: "\n(?:[w\#$%_']|()|(,)|[]|[0-9])*\n  (?:[w\#$%_']+)\n"
+        │   ├── unescaped: "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
         │   └── flags: ∅
         ├── @ CallNode (location: (26,0)-(26,16))
         │   ├── receiver:
@@ -156,7 +156,7 @@
         │   │   ├── opening_loc: (26,0)-(26,1) = "/"
         │   │   ├── content_loc: (26,1)-(26,7) = "(?#\\))"
         │   │   ├── closing_loc: (26,7)-(26,8) = "/"
-        │   │   ├── unescaped: "(?#))"
+        │   │   ├── unescaped: "(?#\\))"
         │   │   └── flags: ∅
         │   ├── call_operator_loc: ∅
         │   ├── message_loc: (26,9)-(26,11) = "=~"
diff --git a/test/prism/snapshots/seattlerb/bug190.txt b/test/prism/snapshots/seattlerb/bug190.txt
index 527304835a..fec48914c9 100644
--- a/test/prism/snapshots/seattlerb/bug190.txt
+++ b/test/prism/snapshots/seattlerb/bug190.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,3) = "%r'"
             ├── content_loc: (1,3)-(1,5) = "\\'"
             ├── closing_loc: (1,5)-(1,6) = "'"
-            ├── unescaped: "'"
+            ├── unescaped: "\\'"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
index 3bc991033c..caf67b892d 100644
--- a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,6) = "\\cC\\d"
             ├── closing_loc: (1,6)-(1,7) = "/"
-            ├── unescaped: "\u0003d"
+            ├── unescaped: "\\x03\\d"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_esc_u.txt b/test/prism/snapshots/seattlerb/regexp_esc_u.txt
index adbfe36880..ea6bbb6141 100644
--- a/test/prism/snapshots/seattlerb/regexp_esc_u.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_u.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,16) = "[\\u0021-\\u0027]"
             ├── closing_loc: (1,16)-(1,17) = "/"
-            ├── unescaped: "[!-']"
+            ├── unescaped: "[\\u0021-\\u0027]"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
index 5e039bd16e..74e8b52787 100644
--- a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
+++ b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
@@ -7,11 +7,11 @@
         │   ├── opening_loc: (1,0)-(1,1) = "/"
         │   ├── content_loc: (1,1)-(1,14) = "\\u{c0de babe}"
         │   ├── closing_loc: (1,14)-(1,15) = "/"
-        │   ├── unescaped: "샞몾"
+        │   ├── unescaped: "\\u{c0de babe}"
         │   └── flags: ∅
         └── @ RegularExpressionNode (location: (3,0)-(3,8))
             ├── opening_loc: (3,0)-(3,1) = "/"
             ├── content_loc: (3,1)-(3,7) = "\\u{df}"
             ├── closing_loc: (3,7)-(3,8) = "/"
-            ├── unescaped: "ß"
+            ├── unescaped: "\\u{df}"
             └── flags: ∅
diff --git a/test/prism/snapshots/spanning_heredoc.txt b/test/prism/snapshots/spanning_heredoc.txt
index 2c59cb4368..6b3e3c92d7 100644
--- a/test/prism/snapshots/spanning_heredoc.txt
+++ b/test/prism/snapshots/spanning_heredoc.txt
@@ -28,10 +28,10 @@
         │   │           │       ├── @ InterpolatedRegularExpressionNode (location: (4,13)-(7,2))
         │   │           │       │   ├── opening_loc: (4,13)-(4,14) = "/"
         │   │           │       │   ├── parts: (length: 2)
-        │   │           │       │   │   ├── @ StringNode (location: (4,14)-(4,0))
+        │   │           │       │   │   ├── @ StringNode (location: (4,14)-(4,16))
         │   │           │       │   │   │   ├── flags: ∅
         │   │           │       │   │   │   ├── opening_loc: ∅
-        │   │           │       │   │   │   ├── content_loc: (4,14)-(4,0) = "b\\\n"
+        │   │           │       │   │   │   ├── content_loc: (4,14)-(4,16) = "b\\"
         │   │           │       │   │   │   ├── closing_loc: ∅
         │   │           │       │   │   │   └── unescaped: "b"
         │   │           │       │   │   └── @ StringNode (location: (7,0)-(7,1))
diff --git a/test/prism/snapshots/unescaping.txt b/test/prism/snapshots/unescaping.txt
index a59dc01626..ee7c3759cb 100644
--- a/test/prism/snapshots/unescaping.txt
+++ b/test/prism/snapshots/unescaping.txt
@@ -17,7 +17,7 @@
         │   ├── opening_loc: (3,0)-(3,1) = "/"
         │   ├── content_loc: (3,1)-(3,7) = "\\c\#{1}"
         │   ├── closing_loc: (3,7)-(3,8) = "/"
-        │   ├── unescaped: "\u0003{1}"
+        │   ├── unescaped: "\\x03{1}"
         │   └── flags: ∅
         ├── @ StringNode (location: (5,0)-(5,8))
         │   ├── flags: ∅
diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt
index 7c477382dc..21e73552ef 100644
--- a/test/prism/snapshots/unparser/corpus/literal/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt
@@ -545,7 +545,7 @@
         │   ├── opening_loc: (50,0)-(50,1) = "/"
         │   ├── content_loc: (50,1)-(50,27) = "[^-+',.\\/:@[:alnum:]\\[\\]]+"
         │   ├── closing_loc: (50,27)-(50,28) = "/"
-        │   ├── unescaped: "[^-+',./:@[:alnum:][]]+"
+        │   ├── unescaped: "[^-+',./:@[:alnum:]\\[\\]]+"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (51,0)-(51,12))
         │   ├── opening_loc: (51,0)-(51,1) = "/"
@@ -606,19 +606,19 @@
         │   ├── opening_loc: (54,0)-(54,1) = "/"
         │   ├── content_loc: (54,1)-(54,3) = "\\n"
         │   ├── closing_loc: (54,3)-(54,4) = "/"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: ∅
         ├── @ RegularExpressionNode (location: (55,0)-(55,4))
         │   ├── opening_loc: (55,0)-(55,1) = "/"
         │   ├── content_loc: (55,1)-(55,3) = "\\n"
         │   ├── closing_loc: (55,3)-(55,4) = "/"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: ∅
         ├── @ RegularExpressionNode (location: (56,0)-(56,5))
         │   ├── opening_loc: (56,0)-(56,1) = "/"
         │   ├── content_loc: (56,1)-(56,3) = "\\n"
         │   ├── closing_loc: (56,3)-(56,5) = "/x"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: extended
         ├── @ RegularExpressionNode (location: (57,0)-(57,7))
         │   ├── opening_loc: (57,0)-(57,1) = "/"
diff --git a/test/prism/snapshots/unparser/corpus/semantic/literal.txt b/test/prism/snapshots/unparser/corpus/semantic/literal.txt
index c79d0370da..6da3b56f33 100644
--- a/test/prism/snapshots/unparser/corpus/semantic/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/semantic/literal.txt
@@ -33,7 +33,7 @@
         │   ├── opening_loc: (10,0)-(10,3) = "%r("
         │   ├── content_loc: (10,3)-(10,5) = "\\)"
         │   ├── closing_loc: (10,5)-(10,6) = ")"
-        │   ├── unescaped: ")"
+        │   ├── unescaped: "\\)"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (11,0)-(11,14))
         │   ├── opening_loc: (11,0)-(11,3) = "%r("
diff --git a/test/prism/snapshots/whitequark/parser_bug_830.txt b/test/prism/snapshots/whitequark/parser_bug_830.txt
index f19fffbba0..e380113372 100644
--- a/test/prism/snapshots/whitequark/parser_bug_830.txt
+++ b/test/prism/snapshots/whitequark/parser_bug_830.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,3) = "\\("
             ├── closing_loc: (1,3)-(1,4) = "/"
-            ├── unescaped: "("
+            ├── unescaped: "\\("
             └── flags: ∅
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
index 123c139077..051b5e29d1 100644
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@@ -108,40 +108,50 @@ module Prism
     escapes = [*ascii, *ascii8, *newlines, *octal, *hex2, *hex4, *hex6, *ctrls]
 
     contexts = [
-      [Context::String.new("?", ""),              escapes],
-      [Context::String.new("'", "'"),             escapes],
-      [Context::String.new("\"", "\""),           escapes],
-      [Context::String.new("%q[", "]"),           escapes],
-      [Context::String.new("%Q[", "]"),           escapes],
-      [Context::String.new("%[", "]"),            escapes],
-      [Context::String.new("`", "`"),             escapes],
-      [Context::String.new("%x[", "]"),           escapes],
-      [Context::String.new("<<H\n", "\nH"),       escapes],
-      [Context::String.new("<<'H'\n", "\nH"),     escapes],
-      [Context::String.new("<<\"H\"\n", "\nH"),   escapes],
-      [Context::String.new("<<`H`\n", "\nH"),     escapes],
-      [Context::String.new("<<-H\n", "\nH"),      escapes],
-      [Context::String.new("<<-'H'\n", "\nH"),    escapes],
-      [Context::String.new("<<-\"H\"\n", "\nH"),  escapes],
-      [Context::String.new("<<-`H`\n", "\nH"),    escapes],
-      [Context::Heredoc.new("<<~H\n", "\nH"),     escapes],
-      [Context::Heredoc.new("<<~'H'\n", "\nH"),   escapes],
-      [Context::Heredoc.new("<<~\"H\"\n", "\nH"), escapes],
-      [Context::Heredoc.new("<<~`H`\n", "\nH"),   escapes],
-      [Context::List.new("%w[", "]"),             escapes],
-      [Context::List.new("%W[", "]"),             escapes],
-      [Context::List.new("%i[", "]"),             escapes],
-      [Context::List.new("%I[", "]"),             escapes],
-      [Context::Symbol.new("%s[", "]"),           escapes],
-      [Context::Symbol.new(":'", "'"),            escapes],
-      [Context::Symbol.new(":\"", "\""),          escapes],
-      # [Context::RegExp.new("/", "/"),            escapes],
-      # [Context::RegExp.new("%r[", "]"),          escapes]
+      Context::String.new("?", ""),
+      Context::String.new("'", "'"),
+      Context::String.new("\"", "\""),
+      Context::String.new("%q[", "]"),
+      Context::String.new("%Q[", "]"),
+      Context::String.new("%[", "]"),
+      Context::String.new("`", "`"),
+      Context::String.new("%x[", "]"),
+      Context::String.new("<<H\n", "\nH"),
+      Context::String.new("<<'H'\n", "\nH"),
+      Context::String.new("<<\"H\"\n", "\nH"),
+      Context::String.new("<<`H`\n", "\nH"),
+      Context::String.new("<<-H\n", "\nH"),
+      Context::String.new("<<-'H'\n", "\nH"),
+      Context::String.new("<<-\"H\"\n", "\nH"),
+      Context::String.new("<<-`H`\n", "\nH"),
+      Context::Heredoc.new("<<~H\n", "\nH"),
+      Context::Heredoc.new("<<~'H'\n", "\nH"),
+      Context::Heredoc.new("<<~\"H\"\n", "\nH"),
+      Context::Heredoc.new("<<~`H`\n", "\nH"),
+      Context::List.new("%w[", "]"),
+      Context::List.new("%w<", ">"),
+      Context::List.new("%W[", "]"),
+      Context::List.new("%i[", "]"),
+      Context::List.new("%I[", "]"),
+      Context::Symbol.new("%s[", "]"),
+      Context::Symbol.new(":'", "'"),
+      Context::Symbol.new(":\"", "\""),
+      Context::RegExp.new("/", "/"),
+      Context::RegExp.new("%r[", "]"),
+      Context::RegExp.new("%r<", ">"),
+      Context::RegExp.new("%r{", "}"),
+      Context::RegExp.new("%r(", ")"),
+      Context::RegExp.new("%r|", "|"),
     ]
 
-    contexts.each do |(context, escapes)|
+    contexts.each do |context|
       escapes.each do |escape|
-        next if context.name == "?" && escape == "\xFF".b # wat?
+        # I think this might be a bug in Ruby.
+        next if context.name == "?" && escape == "\xFF".b
+
+        # We don't currently support scanning for the number of capture groups,
+        # so these are all going to fail.
+        next if (context.name == "//" || context.name.start_with?("%r")) && escape.start_with?(/\d/)
 
         define_method(:"test_#{context.name}_#{escape.inspect}") do
           assert_unescape(context, escape)
author	Kevin Newton <kddnewton@gmail.com>	2023-10-12 08:46:40 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2023-10-13 15:31:30 -0400
commit	fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch)
tree	bf98c1898db99a2d35aa759c98dfb259f02055a5
parent	e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff)