diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2024-03-25 09:24:35 -0400 |
|---|---|---|
| committer | Kevin Newton <kddnewton@gmail.com> | 2024-03-25 11:52:12 -0400 |
| commit | 86077fbcde05f4abd6b306ad0fcc88ee891f8e8b (patch) | |
| tree | 922cc80305c88e318e8488261dae34392d7feeb7 | |
| parent | 0bc764b7298fe79ded630841f0781979a9d0d806 (diff) | |
[ruby/prism] Refactor regexp lexing to make it easier to support CLRF
https://github.com/ruby/prism/commit/60805d85ca
| -rw-r--r-- | prism/prism.c | 239 | ||||
| -rw-r--r-- | test/prism/encoding_test.rb | 2 |
2 files changed, 120 insertions, 121 deletions
diff --git a/prism/prism.c b/prism/prism.c index 9f0ecdb938..6aa611624a 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -10783,36 +10783,6 @@ parser_lex(pm_parser_t *parser) { pm_regexp_token_buffer_t token_buffer = { 0 }; while (breakpoint != NULL) { - // If we hit a null byte, skip directly past it. - if (*breakpoint == '\0') { - parser->current.end = breakpoint + 1; - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - continue; - } - - // If we've hit a newline, then we need to track that in the - // list of newlines. - if (*breakpoint == '\n') { - // For the special case of a newline-terminated regular expression, we will pass - // through this branch twice -- once with PM_TOKEN_REGEXP_BEGIN and then again - // with PM_TOKEN_STRING_CONTENT. Let's avoid tracking the newline twice, by - // tracking it only in the REGEXP_BEGIN case. - if ( - !(lex_mode->as.regexp.terminator == '\n' && parser->current.type != PM_TOKEN_REGEXP_BEGIN) - && parser->heredoc_end == NULL - ) { - pm_newline_list_append(&parser->newline_list, breakpoint); - } - - if (lex_mode->as.regexp.terminator != '\n') { - // If the terminator is not a newline, then we can set - // the next breakpoint and continue. - parser->current.end = breakpoint + 1; - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - continue; - } - } - // If we hit the terminator, we need to determine what kind of // token to return. if (*breakpoint == lex_mode->as.regexp.terminator) { @@ -10832,9 +10802,17 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_STRING_CONTENT); } + // Check here if we need to track the newline. + size_t eol_length = match_eol_at(parser, breakpoint); + if (eol_length) { + parser->current.end = breakpoint + eol_length; + pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + } else { + parser->current.end = breakpoint + 1; + } + // Since we've hit the terminator of the regular expression, // we now need to parse the options. - parser->current.end = breakpoint + 1; parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); lex_mode_pop(parser); @@ -10842,114 +10820,135 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_REGEXP_END); } - // If we hit escapes, then we need to treat the next token - // literally. In this case we'll skip past the next character + // If we've hit the incrementor, then we need to skip past it // and find the next breakpoint. - if (*breakpoint == '\\') { + if (*breakpoint && *breakpoint == lex_mode->as.regexp.incrementor) { parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + lex_mode->as.regexp.nesting++; + continue; + } - // If we've hit the end of the file, then break out of the - // loop by setting the breakpoint to NULL. - if (parser->current.end == parser->end) { - breakpoint = NULL; - continue; - } - - pm_regexp_token_buffer_escape(parser, &token_buffer); - uint8_t peeked = peek(parser); + switch (*breakpoint) { + case '\0': + // If we hit a null byte, skip directly past it. + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + case '\n': + // If we've hit a newline, then we need to track that in + // the list of newlines. + if (parser->heredoc_end == NULL) { + pm_newline_list_append(&parser->newline_list, breakpoint); + } - switch (peeked) { - case '\r': - parser->current.end++; - if (peek(parser) != '\n') { - if (lex_mode->as.regexp.terminator != '\r') { - pm_token_buffer_push_byte(&token_buffer.base, '\\'); - } - pm_regexp_token_buffer_push_byte(&token_buffer, '\r'); - pm_token_buffer_push_byte(&token_buffer.base, '\r'); - break; - } - /* fallthrough */ - case '\n': - if (parser->heredoc_end) { - // ... if we are on the same line as a heredoc, - // flush the heredoc and continue parsing after - // heredoc_end. - parser_flush_heredoc_end(parser); - pm_regexp_token_buffer_copy(parser, &token_buffer); - LEX(PM_TOKEN_STRING_CONTENT); - } else { - // ... else track the newline. - pm_newline_list_append(&parser->newline_list, parser->current.end); - } + parser->current.end = breakpoint + 1; + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + case '\\': { + // If we hit escapes, then we need to treat the next + // token literally. In this case we'll skip past the + // next character and find the next breakpoint. + parser->current.end = breakpoint + 1; - parser->current.end++; - break; - case 'c': - case 'C': - case 'M': - case 'u': - case 'x': - escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP); + // If we've hit the end of the file, then break out of + // the loop by setting the breakpoint to NULL. + if (parser->current.end == parser->end) { + breakpoint = NULL; break; - default: - if (lex_mode->as.regexp.terminator == peeked) { - // Some characters when they are used as the - // terminator also receive an escape. They are - // enumerated here. - switch (peeked) { - case '$': case ')': case '*': case '+': - case '.': case '>': case '?': case ']': - case '^': case '|': case '}': + } + + pm_regexp_token_buffer_escape(parser, &token_buffer); + uint8_t peeked = peek(parser); + + switch (peeked) { + case '\r': + parser->current.end++; + if (peek(parser) != '\n') { + if (lex_mode->as.regexp.terminator != '\r') { pm_token_buffer_push_byte(&token_buffer.base, '\\'); - break; - default: - break; + } + pm_regexp_token_buffer_push_byte(&token_buffer, '\r'); + pm_token_buffer_push_byte(&token_buffer.base, '\r'); + break; + } + /* fallthrough */ + case '\n': + if (parser->heredoc_end) { + // ... if we are on the same line as a heredoc, + // flush the heredoc and continue parsing after + // heredoc_end. + parser_flush_heredoc_end(parser); + pm_regexp_token_buffer_copy(parser, &token_buffer); + LEX(PM_TOKEN_STRING_CONTENT); + } else { + // ... else track the newline. + pm_newline_list_append(&parser->newline_list, parser->current.end); } - pm_regexp_token_buffer_push_byte(&token_buffer, peeked); - pm_token_buffer_push_byte(&token_buffer.base, peeked); parser->current.end++; break; - } - - if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\'); - pm_regexp_token_buffer_push_escaped(&token_buffer, parser); - break; - } + case 'c': + case 'C': + case 'M': + case 'u': + case 'x': + escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP); + break; + default: + if (lex_mode->as.regexp.terminator == peeked) { + // Some characters when they are used as the + // terminator also receive an escape. They are + // enumerated here. + switch (peeked) { + case '$': case ')': case '*': case '+': + case '.': case '>': case '?': case ']': + case '^': case '|': case '}': + pm_token_buffer_push_byte(&token_buffer.base, '\\'); + break; + default: + break; + } - token_buffer.base.cursor = parser->current.end; - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - continue; - } + pm_regexp_token_buffer_push_byte(&token_buffer, peeked); + pm_token_buffer_push_byte(&token_buffer.base, peeked); + parser->current.end++; + break; + } - // If we hit a #, then we will attempt to lex interpolation. - if (*breakpoint == '#') { - pm_token_type_t type = lex_interpolation(parser, breakpoint); + if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\'); + pm_regexp_token_buffer_push_escaped(&token_buffer, parser); + break; + } - if (type == PM_TOKEN_NOT_PROVIDED) { - // If we haven't returned at this point then we had - // something that looked like an interpolated class or - // instance variable like "#@" but wasn't actually. In - // this case we'll just skip to the next breakpoint. + token_buffer.base.cursor = parser->current.end; breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - continue; + break; } + case '#': { + // If we hit a #, then we will attempt to lex + // interpolation. + pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type == PM_TOKEN_STRING_CONTENT) { - pm_regexp_token_buffer_flush(parser, &token_buffer); - } + if (type == PM_TOKEN_NOT_PROVIDED) { + // If we haven't returned at this point then we had + // something that looked like an interpolated class or + // instance variable like "#@" but wasn't actually. In + // this case we'll just skip to the next breakpoint. + breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); + break; + } - LEX(type); - } + if (type == PM_TOKEN_STRING_CONTENT) { + pm_regexp_token_buffer_flush(parser, &token_buffer); + } - // If we've hit the incrementor, then we need to skip past it - // and find the next breakpoint. - assert(*breakpoint == lex_mode->as.regexp.incrementor); - parser->current.end = breakpoint + 1; - breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); - lex_mode->as.regexp.nesting++; - continue; + LEX(type); + } + default: + assert(false && "unreachable"); + break; + } } if (parser->current.end > parser->current.start) { diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 649d05b874..2aee473ddf 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -344,7 +344,7 @@ module Prism next if ["/", "{"].include?(character) source = "# encoding: #{name}\n/(?##{character})/\n" - assert Prism.parse(source).success? + assert Prism.parse(source).success?, "Expected #{source.inspect} to parse successfully." end rescue RangeError source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}" |
