[ruby/prism] Use current_string to handle %w escapes

https://github.com/ruby/prism/commit/b8420ea7ae
author: Kevin Newton <kddnewton@gmail.com> 2023-10-10 14:13:53 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2023-10-13 15:31:30 -0400
commit: 4e3013f42d4562d8e84e88e20e2f76668e8a58f6 (patch)
tree: 9cddcd64baee4601c3d46ebe4d2b5c646d4bce69
parent: 3dba3ab47d8d83378d916610d8f4769663ebdf27 (diff)
2 files changed, 99 insertions, 29 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 44d081237b..3d103afa01 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6064,6 +6064,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) {
     }
 }
 
+// static const uint8_t PM_ESCAPE_FLAG_NONE = 0x0;
 static const uint8_t PM_ESCAPE_FLAG_CONTROL = 0x1;
 static const uint8_t PM_ESCAPE_FLAG_META = 0x2;
 static const uint8_t PM_ESCAPE_FLAG_SINGLE = 0x4;
@@ -7825,6 +7826,11 @@ parser_lex(pm_parser_t *parser) {
             const uint8_t *breakpoints = lex_mode->as.list.breakpoints;
             const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
 
+            // If we haven't found an escape yet, then this buffer will be
+            // unallocated since we can refer directly to the source string.
+            pm_buffer_t buffer = (pm_buffer_t) { .value = NULL, .length = 0, .capacity = 0 };
+            const uint8_t *buffer_cursor = NULL;
+
             while (breakpoint != NULL) {
                 // If we hit a null byte, skip directly past it.
                 if (*breakpoint == '\0') {
@@ -7836,10 +7842,18 @@ parser_lex(pm_parser_t *parser) {
                 // now, so we can return an element of the list.
                 if (pm_char_is_whitespace(*breakpoint)) {
                     parser->current.end = breakpoint;
+
+                    if (buffer_cursor == NULL) {
+                        pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
+                    } else {
+                        pm_buffer_append_bytes(&buffer, buffer_cursor, (size_t) (parser->current.end - buffer_cursor));
+                        pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);
+                    }
+
                     LEX(PM_TOKEN_STRING_CONTENT);
                 }
 
-                //If we hit the terminator, we need to check which token to
+                // If we hit the terminator, we need to check which token to
                 // return.
                 if (*breakpoint == lex_mode->as.list.terminator) {
                     // If this terminator doesn't actually close the list, then
@@ -7854,6 +7868,14 @@ parser_lex(pm_parser_t *parser) {
                     // past content, then we can return a list node.
                     if (breakpoint > parser->current.start) {
                         parser->current.end = breakpoint;
+
+                        if (buffer_cursor == NULL) {
+                            pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
+                        } else {
+                            pm_buffer_append_bytes(&buffer, buffer_cursor, (size_t) (parser->current.end - buffer_cursor));
+                            pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);
+                        }
+
                         LEX(PM_TOKEN_STRING_CONTENT);
                     }
 
@@ -7869,45 +7891,90 @@ parser_lex(pm_parser_t *parser) {
                 // literally. In this case we'll skip past the next character
                 // and find the next breakpoint.
                 if (*breakpoint == '\\') {
-                    pm_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? PM_UNESCAPE_ALL : PM_UNESCAPE_MINIMAL;
-                    size_t difference = pm_unescape_calculate_difference(parser, breakpoint, unescape_type);
-                    if (difference == 0) {
-                        // we're at the end of the file
+                    if (buffer_cursor == NULL) {
+                        pm_buffer_init_capacity(&buffer, 16);
+                        pm_buffer_append_bytes(&buffer, parser->current.start, (size_t) (breakpoint - parser->current.start));
+                    } else {
+                        pm_buffer_append_bytes(&buffer, buffer_cursor, (size_t) (breakpoint - buffer_cursor));
+                    }
+
+                    parser->current.end = breakpoint + 1;
+
+                    // If we've hit the end of the file, then break out of the
+                    // loop by setting the breakpoint to NULL.
+                    if (parser->current.end == parser->end) {
                         breakpoint = NULL;
                         continue;
                     }
 
-                    // If the result is an escaped newline ...
-                    if (breakpoint[difference - 1] == '\n') {
-                        if (parser->heredoc_end) {
-                            // ... if we are on the same line as a heredoc, flush the heredoc and
-                            // continue parsing after heredoc_end.
-                            parser->current.end = breakpoint + difference;
-                            parser_flush_heredoc_end(parser);
-                            LEX(PM_TOKEN_STRING_CONTENT);
-                        } else {
-                            // ... else track the newline.
-                            pm_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
-                        }
+                    uint8_t peeked = peek(parser);
+                    switch (peeked) {
+                        case ' ':
+                        case '\f':
+                        case '\t':
+                        case '\v':
+                        case '\\':
+                            pm_buffer_append_u8(&buffer, peeked);
+                            break;
+                        case '\r':
+                            pm_buffer_append_u8(&buffer, '\r');
+                            if (peek_offset(parser, 1) != '\n') break;
+
+                            parser->current.end++;
+                        /* fallthrough */
+                        case '\n':
+                            pm_buffer_append_u8(&buffer, '\n');
+
+                            if (parser->heredoc_end) {
+                                // ... if we are on the same line as a heredoc,
+                                // flush the heredoc and continue parsing after
+                                // heredoc_end.
+                                parser_flush_heredoc_end(parser);
+                                LEX(PM_TOKEN_STRING_CONTENT);
+                            } else {
+                                // ... else track the newline.
+                                pm_newline_list_append(&parser->newline_list, parser->current.end);
+                            }
+
+                            break;
+                        default:
+                            if (peeked != lex_mode->as.list.incrementor && peeked != lex_mode->as.list.terminator) {
+                                pm_buffer_append_u8(&buffer, '\\');
+                            }
+                            pm_buffer_append_u8(&buffer, peeked);
+                            break;
                     }
 
-                    breakpoint = pm_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
+                    parser->current.end++;
+                    buffer_cursor = parser->current.end;
+
+                    breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
                     continue;
                 }
 
                 // If we hit a #, then we will attempt to lex interpolation.
                 if (*breakpoint == '#') {
                     pm_token_type_t type = lex_interpolation(parser, breakpoint);
-                    if (type != PM_TOKEN_NOT_PROVIDED) {
-                        LEX(type);
+
+                    if (type == PM_TOKEN_NOT_PROVIDED) {
+                        // If we haven't returned at this point then we had something
+                        // that looked like an interpolated class or instance variable
+                        // like "#@" but wasn't actually. In this case we'll just skip
+                        // to the next breakpoint.
+                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
+                        continue;
                     }
 
-                    // If we haven't returned at this point then we had something
-                    // that looked like an interpolated class or instance variable
-                    // like "#@" but wasn't actually. In this case we'll just skip
-                    // to the next breakpoint.
-                    breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end);
-                    continue;
+                    if (type == PM_TOKEN_STRING_CONTENT) {
+                        if (buffer_cursor == NULL) {
+                            pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
+                        } else {
+                            pm_buffer_append_bytes(&buffer, buffer_cursor, (size_t) (parser->current.end - buffer_cursor));
+                            pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length);
+                        }
+                    }
+
+                    LEX(type);
                 }
 
                 // If we've hit the incrementor, then we need to skip past it
@@ -13491,8 +13558,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
 
                 pm_token_t opening = not_provided(parser);
                 pm_token_t closing = not_provided(parser);
-                pm_node_t *string = (pm_node_t *) pm_string_node_create_and_unescape(parser, &opening, &parser->previous, &closing, PM_UNESCAPE_WHITESPACE);
-                pm_array_node_elements_append(array, string);
+
+                pm_string_node_t *string = (pm_string_node_t *) pm_string_node_create(parser, &opening, &parser->previous, &closing);
+                string->unescaped = parser->current_string;
+
+                pm_array_node_elements_append(array, (pm_node_t *) string);
             }
 
             expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM);
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
index fe64592861..d93c50f1fc 100644
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@@ -103,7 +103,7 @@ module Prism
       # [Context::String.new("<<~'H'\n", "\nH"),   escapes],
       # [Context::String.new("<<~\"H\"\n", "\nH"), escapes],
       # [Context::String.new("<<~`H`\n", "\nH"),   escapes],
-      # [Context::List.new("%w[", "]"),            escapes],
+      [Context::List.new("%w[", "]"),            escapes],
       # [Context::List.new("%W[", "]"),            escapes],
       # [Context::List.new("%i[", "]"),            escapes],
       # [Context::List.new("%I[", "]"),            escapes],
author	Kevin Newton <kddnewton@gmail.com>	2023-10-10 14:13:53 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2023-10-13 15:31:30 -0400
commit	4e3013f42d4562d8e84e88e20e2f76668e8a58f6 (patch)
tree	9cddcd64baee4601c3d46ebe4d2b5c646d4bce69
parent	3dba3ab47d8d83378d916610d8f4769663ebdf27 (diff)