author    Kevin Newton <kddnewton@gmail.com>    2024-03-25 09:24:35 -0400
committer Kevin Newton <kddnewton@gmail.com>    2024-03-25 11:52:12 -0400
commit    86077fbcde05f4abd6b306ad0fcc88ee891f8e8b (patch)
tree      922cc80305c88e318e8488261dae34392d7feeb7
parent    0bc764b7298fe79ded630841f0781979a9d0d806 (diff)
[ruby/prism] Refactor regexp lexing to make it easier to support CRLF
https://github.com/ruby/prism/commit/60805d85ca
-rw-r--r--    prism/prism.c                   239
-rw-r--r--    test/prism/encoding_test.rb       2
2 files changed, 120 insertions(+), 121 deletions(-)
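
The gist of the refactor: instead of special-casing '\n' before the terminator check, the lexer measures the end-of-line sequence at the breakpoint with match_eol_at and advances by its length, so a CRLF line ending can be handled the same way as a bare LF. A minimal sketch of what such an EOL-matching helper looks like (hypothetical name and signature, not the actual prism implementation):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical sketch: return the length of the end-of-line sequence
     * starting at `cursor` (1 for "\n", 2 for "\r\n"), or 0 if `cursor` does
     * not point at an end of line. Advancing by this length lets the lexer
     * record exactly one newline for either line ending. */
    static size_t
    match_eol_sketch(const uint8_t *cursor, const uint8_t *end) {
        if (cursor < end && cursor[0] == '\n') return 1;
        if (cursor + 1 < end && cursor[0] == '\r' && cursor[1] == '\n') return 2;
        return 0;
    }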
diff --git a/prism/prism.c b/prism/prism.c
index 9f0ecdb938..6aa611624a 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -10783,36 +10783,6 @@ parser_lex(pm_parser_t *parser) {
pm_regexp_token_buffer_t token_buffer = { 0 };
while (breakpoint != NULL) {
- // If we hit a null byte, skip directly past it.
- if (*breakpoint == '\0') {
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
-
- // If we've hit a newline, then we need to track that in the
- // list of newlines.
- if (*breakpoint == '\n') {
- // For the special case of a newline-terminated regular expression, we will pass
- // through this branch twice -- once with PM_TOKEN_REGEXP_BEGIN and then again
- // with PM_TOKEN_STRING_CONTENT. Let's avoid tracking the newline twice, by
- // tracking it only in the REGEXP_BEGIN case.
- if (
- !(lex_mode->as.regexp.terminator == '\n' && parser->current.type != PM_TOKEN_REGEXP_BEGIN)
- && parser->heredoc_end == NULL
- ) {
- pm_newline_list_append(&parser->newline_list, breakpoint);
- }
-
- if (lex_mode->as.regexp.terminator != '\n') {
- // If the terminator is not a newline, then we can set
- // the next breakpoint and continue.
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
- }
-
// If we hit the terminator, we need to determine what kind of
// token to return.
if (*breakpoint == lex_mode->as.regexp.terminator) {
@@ -10832,9 +10802,17 @@ parser_lex(pm_parser_t *parser) {
LEX(PM_TOKEN_STRING_CONTENT);
}
+ // Check here if we need to track the newline.
+ size_t eol_length = match_eol_at(parser, breakpoint);
+ if (eol_length) {
+ parser->current.end = breakpoint + eol_length;
+ pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
+ } else {
+ parser->current.end = breakpoint + 1;
+ }
+
// Since we've hit the terminator of the regular expression,
// we now need to parse the options.
- parser->current.end = breakpoint + 1;
parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
lex_mode_pop(parser);
@@ -10842,114 +10820,135 @@ parser_lex(pm_parser_t *parser) {
LEX(PM_TOKEN_REGEXP_END);
}
- // If we hit escapes, then we need to treat the next token
- // literally. In this case we'll skip past the next character
+ // If we've hit the incrementor, then we need to skip past it
// and find the next breakpoint.
- if (*breakpoint == '\\') {
+ if (*breakpoint && *breakpoint == lex_mode->as.regexp.incrementor) {
parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ lex_mode->as.regexp.nesting++;
+ continue;
+ }
- // If we've hit the end of the file, then break out of the
- // loop by setting the breakpoint to NULL.
- if (parser->current.end == parser->end) {
- breakpoint = NULL;
- continue;
- }
-
- pm_regexp_token_buffer_escape(parser, &token_buffer);
- uint8_t peeked = peek(parser);
+ switch (*breakpoint) {
+ case '\0':
+ // If we hit a null byte, skip directly past it.
+ parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ case '\n':
+ // If we've hit a newline, then we need to track that in
+ // the list of newlines.
+ if (parser->heredoc_end == NULL) {
+ pm_newline_list_append(&parser->newline_list, breakpoint);
+ }
- switch (peeked) {
- case '\r':
- parser->current.end++;
- if (peek(parser) != '\n') {
- if (lex_mode->as.regexp.terminator != '\r') {
- pm_token_buffer_push_byte(&token_buffer.base, '\\');
- }
- pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
- pm_token_buffer_push_byte(&token_buffer.base, '\r');
- break;
- }
- /* fallthrough */
- case '\n':
- if (parser->heredoc_end) {
- // ... if we are on the same line as a heredoc,
- // flush the heredoc and continue parsing after
- // heredoc_end.
- parser_flush_heredoc_end(parser);
- pm_regexp_token_buffer_copy(parser, &token_buffer);
- LEX(PM_TOKEN_STRING_CONTENT);
- } else {
- // ... else track the newline.
- pm_newline_list_append(&parser->newline_list, parser->current.end);
- }
+ parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ case '\\': {
+ // If we hit escapes, then we need to treat the next
+ // token literally. In this case we'll skip past the
+ // next character and find the next breakpoint.
+ parser->current.end = breakpoint + 1;
- parser->current.end++;
- break;
- case 'c':
- case 'C':
- case 'M':
- case 'u':
- case 'x':
- escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP);
+ // If we've hit the end of the file, then break out of
+ // the loop by setting the breakpoint to NULL.
+ if (parser->current.end == parser->end) {
+ breakpoint = NULL;
break;
- default:
- if (lex_mode->as.regexp.terminator == peeked) {
- // Some characters when they are used as the
- // terminator also receive an escape. They are
- // enumerated here.
- switch (peeked) {
- case '$': case ')': case '*': case '+':
- case '.': case '>': case '?': case ']':
- case '^': case '|': case '}':
+ }
+
+ pm_regexp_token_buffer_escape(parser, &token_buffer);
+ uint8_t peeked = peek(parser);
+
+ switch (peeked) {
+ case '\r':
+ parser->current.end++;
+ if (peek(parser) != '\n') {
+ if (lex_mode->as.regexp.terminator != '\r') {
pm_token_buffer_push_byte(&token_buffer.base, '\\');
- break;
- default:
- break;
+ }
+ pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
+ pm_token_buffer_push_byte(&token_buffer.base, '\r');
+ break;
+ }
+ /* fallthrough */
+ case '\n':
+ if (parser->heredoc_end) {
+ // ... if we are on the same line as a heredoc,
+ // flush the heredoc and continue parsing after
+ // heredoc_end.
+ parser_flush_heredoc_end(parser);
+ pm_regexp_token_buffer_copy(parser, &token_buffer);
+ LEX(PM_TOKEN_STRING_CONTENT);
+ } else {
+ // ... else track the newline.
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
}
- pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
- pm_token_buffer_push_byte(&token_buffer.base, peeked);
parser->current.end++;
break;
- }
-
- if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\');
- pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
- break;
- }
+ case 'c':
+ case 'C':
+ case 'M':
+ case 'u':
+ case 'x':
+ escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP);
+ break;
+ default:
+ if (lex_mode->as.regexp.terminator == peeked) {
+ // Some characters when they are used as the
+ // terminator also receive an escape. They are
+ // enumerated here.
+ switch (peeked) {
+ case '$': case ')': case '*': case '+':
+ case '.': case '>': case '?': case ']':
+ case '^': case '|': case '}':
+ pm_token_buffer_push_byte(&token_buffer.base, '\\');
+ break;
+ default:
+ break;
+ }
- token_buffer.base.cursor = parser->current.end;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
+ pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
+ pm_token_buffer_push_byte(&token_buffer.base, peeked);
+ parser->current.end++;
+ break;
+ }
- // If we hit a #, then we will attempt to lex interpolation.
- if (*breakpoint == '#') {
- pm_token_type_t type = lex_interpolation(parser, breakpoint);
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\');
+ pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
+ break;
+ }
- if (type == PM_TOKEN_NOT_PROVIDED) {
- // If we haven't returned at this point then we had
- // something that looked like an interpolated class or
- // instance variable like "#@" but wasn't actually. In
- // this case we'll just skip to the next breakpoint.
+ token_buffer.base.cursor = parser->current.end;
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
+ break;
}
+ case '#': {
+ // If we hit a #, then we will attempt to lex
+ // interpolation.
+ pm_token_type_t type = lex_interpolation(parser, breakpoint);
- if (type == PM_TOKEN_STRING_CONTENT) {
- pm_regexp_token_buffer_flush(parser, &token_buffer);
- }
+ if (type == PM_TOKEN_NOT_PROVIDED) {
+ // If we haven't returned at this point then we had
+ // something that looked like an interpolated class or
+ // instance variable like "#@" but wasn't actually. In
+ // this case we'll just skip to the next breakpoint.
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ }
- LEX(type);
- }
+ if (type == PM_TOKEN_STRING_CONTENT) {
+ pm_regexp_token_buffer_flush(parser, &token_buffer);
+ }
- // If we've hit the incrementor, then we need to skip past it
- // and find the next breakpoint.
- assert(*breakpoint == lex_mode->as.regexp.incrementor);
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- lex_mode->as.regexp.nesting++;
- continue;
+ LEX(type);
+ }
+ default:
+ assert(false && "unreachable");
+ break;
+ }
}
if (parser->current.end > parser->current.start) {
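
As a rough usage illustration of the same idea (again a standalone sketch, not prism code), advancing the cursor by the matched EOL length consumes a "\r\n" pair in a single step and records it as one newline:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Same hypothetical helper as in the sketch above. */
    static size_t
    match_eol_sketch(const uint8_t *cursor, const uint8_t *end) {
        if (cursor < end && cursor[0] == '\n') return 1;
        if (cursor + 1 < end && cursor[0] == '\r' && cursor[1] == '\n') return 2;
        return 0;
    }

    int main(void) {
        /* A regexp source line written with a CRLF line ending. */
        const uint8_t source[] = "/foo/\r\n";
        const uint8_t *cursor = source;
        const uint8_t *end = source + sizeof(source) - 1;
        size_t newlines = 0;

        while (cursor < end) {
            size_t eol_length = match_eol_sketch(cursor, end);
            if (eol_length) {
                newlines++;            /* one entry per line ending */
                cursor += eol_length;  /* skip past "\r\n" in one step */
            } else {
                cursor++;
            }
        }

        printf("newlines recorded: %zu\n", newlines); /* prints 1 */
        return 0;
    }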
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 649d05b874..2aee473ddf 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -344,7 +344,7 @@ module Prism
next if ["/", "{"].include?(character)
source = "# encoding: #{name}\n/(?##{character})/\n"
- assert Prism.parse(source).success?
+ assert Prism.parse(source).success?, "Expected #{source.inspect} to parse successfully."
end
rescue RangeError
source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"