author    Kevin Newton <kddnewton@gmail.com>    2024-03-25 09:24:35 -0400
committer Kevin Newton <kddnewton@gmail.com>    2024-03-25 11:52:12 -0400
commit    86077fbcde05f4abd6b306ad0fcc88ee891f8e8b (patch)
tree      922cc80305c88e318e8488261dae34392d7feeb7
parent    0bc764b7298fe79ded630841f0781979a9d0d806 (diff)
[ruby/prism] Refactor regexp lexing to make it easier to support CRLF
https://github.com/ruby/prism/commit/60805d85ca
-rw-r--r--    prism/prism.c                   239
-rw-r--r--    test/prism/encoding_test.rb       2
2 files changed, 120 insertions(+), 121 deletions(-)
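
The gist of the refactor: instead of special-casing '\n' before the terminator check, the lexer measures the end-of-line sequence at the breakpoint with match_eol_at and advances by its length, so a CRLF line ending can be handled the same way as a bare LF. A minimal sketch of what such an EOL-matching helper looks like (hypothetical name and signature, not the actual prism implementation):

    #include <stddef.h>
    #include <stdint.h>

    /* Hypothetical sketch: return the length of the end-of-line sequence
     * starting at `cursor` (1 for "\n", 2 for "\r\n"), or 0 if `cursor` does
     * not point at an end of line. Advancing by this length lets the lexer
     * record exactly one newline for either line ending. */
    static size_t
    match_eol_sketch(const uint8_t *cursor, const uint8_t *end) {
        if (cursor < end && cursor[0] == '\n') return 1;
        if (cursor + 1 < end && cursor[0] == '\r' && cursor[1] == '\n') return 2;
        return 0;
    }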
diff --git a/prism/prism.c b/prism/prism.c
index 9f0ecdb938..6aa611624a 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -10783,36 +10783,6 @@ parser_lex(pm_parser_t *parser) {
pm_regexp_token_buffer_t token_buffer = { 0 };
while (breakpoint != NULL) {
- // If we hit a null byte, skip directly past it.
- if (*breakpoint == '\0') {
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
-
- // If we've hit a newline, then we need to track that in the
- // list of newlines.
- if (*breakpoint == '\n') {
- // For the special case of a newline-terminated regular expression, we will pass
- // through this branch twice -- once with PM_TOKEN_REGEXP_BEGIN and then again
- // with PM_TOKEN_STRING_CONTENT. Let's avoid tracking the newline twice, by
- // tracking it only in the REGEXP_BEGIN case.
- if (
- !(lex_mode->as.regexp.terminator == '\n' && parser->current.type != PM_TOKEN_REGEXP_BEGIN)
- && parser->heredoc_end == NULL
- ) {
- pm_newline_list_append(&parser->newline_list, breakpoint);
- }
-
- if (lex_mode->as.regexp.terminator != '\n') {
- // If the terminator is not a newline, then we can set
- // the next breakpoint and continue.
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
- }
-
// If we hit the terminator, we need to determine what kind of
// token to return.
if (*breakpoint == lex_mode->as.regexp.terminator) {
@@ -10832,9 +10802,17 @@ parser_lex(pm_parser_t *parser) {
LEX(PM_TOKEN_STRING_CONTENT);
}
+ // Check here if we need to track the newline.
+ size_t eol_length = match_eol_at(parser, breakpoint);
+ if (eol_length) {
+ parser->current.end = breakpoint + eol_length;
+ pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
+ } else {
+ parser->current.end = breakpoint + 1;
+ }
+
// Since we've hit the terminator of the regular expression,
// we now need to parse the options.
- parser->current.end = breakpoint + 1;
parser->current.end += pm_strspn_regexp_option(parser->current.end, parser->end - parser->current.end);
lex_mode_pop(parser);
@@ -10842,114 +10820,135 @@ parser_lex(pm_parser_t *parser) {
LEX(PM_TOKEN_REGEXP_END);
}
- // If we hit escapes, then we need to treat the next token
- // literally. In this case we'll skip past the next character
+ // If we've hit the incrementor, then we need to skip past it
// and find the next breakpoint.
- if (*breakpoint == '\\') {
+ if (*breakpoint && *breakpoint == lex_mode->as.regexp.incrementor) {
parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ lex_mode->as.regexp.nesting++;
+ continue;
+ }
- // If we've hit the end of the file, then break out of the
- // loop by setting the breakpoint to NULL.
- if (parser->current.end == parser->end) {
- breakpoint = NULL;
- continue;
- }
-
- pm_regexp_token_buffer_escape(parser, &token_buffer);
- uint8_t peeked = peek(parser);
+ switch (*breakpoint) {
+ case '\0':
+ // If we hit a null byte, skip directly past it.
+ parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ case '\n':
+ // If we've hit a newline, then we need to track that in
+ // the list of newlines.
+ if (parser->heredoc_end == NULL) {
+ pm_newline_list_append(&parser->newline_list, breakpoint);
+ }
- switch (peeked) {
- case '\r':
- parser->current.end++;
- if (peek(parser) != '\n') {
- if (lex_mode->as.regexp.terminator != '\r') {
- pm_token_buffer_push_byte(&token_buffer.base, '\\');
- }
- pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
- pm_token_buffer_push_byte(&token_buffer.base, '\r');
- break;
- }
- /* fallthrough */
- case '\n':
- if (parser->heredoc_end) {
- // ... if we are on the same line as a heredoc,
- // flush the heredoc and continue parsing after
- // heredoc_end.
- parser_flush_heredoc_end(parser);
- pm_regexp_token_buffer_copy(parser, &token_buffer);
- LEX(PM_TOKEN_STRING_CONTENT);
- } else {
- // ... else track the newline.
- pm_newline_list_append(&parser->newline_list, parser->current.end);
- }
+ parser->current.end = breakpoint + 1;
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ case '\\': {
+ // If we hit escapes, then we need to treat the next
+ // token literally. In this case we'll skip past the
+ // next character and find the next breakpoint.
+ parser->current.end = breakpoint + 1;
- parser->current.end++;
- break;
- case 'c':
- case 'C':
- case 'M':
- case 'u':
- case 'x':
- escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP);
+ // If we've hit the end of the file, then break out of
+ // the loop by setting the breakpoint to NULL.
+ if (parser->current.end == parser->end) {
+ breakpoint = NULL;
break;
- default:
- if (lex_mode->as.regexp.terminator == peeked) {
- // Some characters when they are used as the
- // terminator also receive an escape. They are
- // enumerated here.
- switch (peeked) {
- case '$': case ')': case '*': case '+':
- case '.': case '>': case '?': case ']':
- case '^': case '|': case '}':
+ }
+
+ pm_regexp_token_buffer_escape(parser, &token_buffer);
+ uint8_t peeked = peek(parser);
+
+ switch (peeked) {
+ case '\r':
+ parser->current.end++;
+ if (peek(parser) != '\n') {
+ if (lex_mode->as.regexp.terminator != '\r') {
pm_token_buffer_push_byte(&token_buffer.base, '\\');
- break;
- default:
- break;
+ }
+ pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
+ pm_token_buffer_push_byte(&token_buffer.base, '\r');
+ break;
+ }
+ /* fallthrough */
+ case '\n':
+ if (parser->heredoc_end) {
+ // ... if we are on the same line as a heredoc,
+ // flush the heredoc and continue parsing after
+ // heredoc_end.
+ parser_flush_heredoc_end(parser);
+ pm_regexp_token_buffer_copy(parser, &token_buffer);
+ LEX(PM_TOKEN_STRING_CONTENT);
+ } else {
+ // ... else track the newline.
+ pm_newline_list_append(&parser->newline_list, parser->current.end);
}
- pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
- pm_token_buffer_push_byte(&token_buffer.base, peeked);
parser->current.end++;
break;
- }
-
- if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\');
- pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
- break;
- }
+ case 'c':
+ case 'C':
+ case 'M':
+ case 'u':
+ case 'x':
+ escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP);
+ break;
+ default:
+ if (lex_mode->as.regexp.terminator == peeked) {
+ // Some characters when they are used as the
+ // terminator also receive an escape. They are
+ // enumerated here.
+ switch (peeked) {
+ case '$': case ')': case '*': case '+':
+ case '.': case '>': case '?': case ']':
+ case '^': case '|': case '}':
+ pm_token_buffer_push_byte(&token_buffer.base, '\\');
+ break;
+ default:
+ break;
+ }
- token_buffer.base.cursor = parser->current.end;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
- }
+ pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
+ pm_token_buffer_push_byte(&token_buffer.base, peeked);
+ parser->current.end++;
+ break;
+ }
- // If we hit a #, then we will attempt to lex interpolation.
- if (*breakpoint == '#') {
- pm_token_type_t type = lex_interpolation(parser, breakpoint);
+ if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\');
+ pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
+ break;
+ }
- if (type == PM_TOKEN_NOT_PROVIDED) {
- // If we haven't returned at this point then we had
- // something that looked like an interpolated class or
- // instance variable like "#@" but wasn't actually. In
- // this case we'll just skip to the next breakpoint.
+ token_buffer.base.cursor = parser->current.end;
breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- continue;
+ break;
}
+ case '#': {
+ // If we hit a #, then we will attempt to lex
+ // interpolation.
+ pm_token_type_t type = lex_interpolation(parser, breakpoint);
- if (type == PM_TOKEN_STRING_CONTENT) {
- pm_regexp_token_buffer_flush(parser, &token_buffer);
- }
+ if (type == PM_TOKEN_NOT_PROVIDED) {
+ // If we haven't returned at this point then we had
+ // something that looked like an interpolated class or
+ // instance variable like "#@" but wasn't actually. In
+ // this case we'll just skip to the next breakpoint.
+ breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
+ break;
+ }
- LEX(type);
- }
+ if (type == PM_TOKEN_STRING_CONTENT) {
+ pm_regexp_token_buffer_flush(parser, &token_buffer);
+ }
- // If we've hit the incrementor, then we need to skip past it
- // and find the next breakpoint.
- assert(*breakpoint == lex_mode->as.regexp.incrementor);
- parser->current.end = breakpoint + 1;
- breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
- lex_mode->as.regexp.nesting++;
- continue;
+ LEX(type);
+ }
+ default:
+ assert(false && "unreachable");
+ break;
+ }
}
if (parser->current.end > parser->current.start) {
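
As a rough usage illustration of the same idea (again a standalone sketch, not prism code), advancing the cursor by the matched EOL length consumes a "\r\n" pair in a single step and records it as one newline:

    #include <stdio.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Same hypothetical helper as in the sketch above. */
    static size_t
    match_eol_sketch(const uint8_t *cursor, const uint8_t *end) {
        if (cursor < end && cursor[0] == '\n') return 1;
        if (cursor + 1 < end && cursor[0] == '\r' && cursor[1] == '\n') return 2;
        return 0;
    }

    int main(void) {
        /* A regexp source line written with a CRLF line ending. */
        const uint8_t source[] = "/foo/\r\n";
        const uint8_t *cursor = source;
        const uint8_t *end = source + sizeof(source) - 1;
        size_t newlines = 0;

        while (cursor < end) {
            size_t eol_length = match_eol_sketch(cursor, end);
            if (eol_length) {
                newlines++;            /* one entry per line ending */
                cursor += eol_length;  /* skip past "\r\n" in one step */
            } else {
                cursor++;
            }
        }

        printf("newlines recorded: %zu\n", newlines); /* prints 1 */
        return 0;
    }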
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 649d05b874..2aee473ddf 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -344,7 +344,7 @@ module Prism
next if ["/", "{"].include?(character)
source = "# encoding: #{name}\n/(?##{character})/\n"
- assert Prism.parse(source).success?
+ assert Prism.parse(source).success?, "Expected #{source.inspect} to parse successfully."
end
rescue RangeError
source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"