diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2023-08-03 16:43:36 -0400 |
|---|---|---|
| committer | Takashi Kokubun <takashikkbn@gmail.com> | 2023-08-16 17:47:32 -0700 |
| commit | a793260c159e991998efb6975b0fdff92d22c806 (patch) | |
| tree | 318549e2126d4883651aa57fbf122365d8ffbcb8 | |
| parent | 2ef54d38556c1c6bc95704fcb0dfa6a99041404d (diff) | |
[ruby/yarp] Also rework regexp lexer to check terminators properly
https://github.com/ruby/yarp/commit/4b157a8352
Notes
Notes:
Merged: https://github.com/ruby/ruby/pull/8226
| -rw-r--r-- | yarp/yarp.c | 149 |
1 files changed, 71 insertions, 78 deletions
diff --git a/yarp/yarp.c b/yarp/yarp.c index e45c43a11e..bb4db37d7f 100644 --- a/yarp/yarp.c +++ b/yarp/yarp.c @@ -6722,105 +6722,98 @@ parser_lex(yp_parser_t *parser) { } // Get a reference to the current mode. - yp_lex_mode_t *mode = parser->lex_modes.current; + yp_lex_mode_t *lex_mode = parser->lex_modes.current; // These are the places where we need to split up the content of the // regular expression. We'll use strpbrk to find the first of these // characters. - const char *breakpoints = mode->as.regexp.breakpoints; + const char *breakpoints = lex_mode->as.regexp.breakpoints; const char *breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); while (breakpoint != NULL) { - switch (*breakpoint) { - case '\0': - // If we hit a null byte, skip directly past it. - breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); - break; - case '\\': { - // If we hit escapes, then we need to treat the next token - // literally. In this case we'll skip past the next character and - // find the next breakpoint. - size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list); + // If we hit a null byte, skip directly past it. + if (*breakpoint == '\0') { + breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); + continue; + } - // If the result is an escaped newline, then we need to - // track that newline. - if (breakpoint[difference - 1] == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + // If we've hit a newline, then we need to track that in the + // list of newlines. + if (*breakpoint == '\n') { + yp_newline_list_append(&parser->newline_list, breakpoint); - breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); - break; + if (lex_mode->as.regexp.terminator != '\n') { + // If the terminator is not a newline, then we can set + // the next breakpoint and continue. + breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); + continue; } - case '#': { - // If the terminator is #, then we need to fall into the - // default case. Otherwise we'll attempt to lex - // interpolation. - if (mode->as.regexp.terminator != '#') { - yp_token_type_t type = lex_interpolation(parser, breakpoint); - if (type != YP_TOKEN_NOT_PROVIDED) { - LEX(type); - } + } - // If we haven't returned at this point then we had something - // that looked like an interpolated class or instance variable - // like "#@" but wasn't actually. In this case we'll just skip - // to the next breakpoint. - breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); - break; - } + // If we hit the terminator, we need to determine what kind of + // token to return. + if (*breakpoint == lex_mode->as.regexp.terminator) { + if (lex_mode->as.regexp.nesting > 0) { + breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); + lex_mode->as.regexp.nesting--; + continue; } - /* fallthrough */ - default: { - if (*breakpoint == mode->as.regexp.incrementor) { - // If we've hit the incrementor, then we need to skip past it and - // find the next breakpoint. - breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); - mode->as.regexp.nesting++; - break; - } - - if (*breakpoint == '\n') { - // If we've hit a newline, then we need to track - // that in the list of newlines. - yp_newline_list_append(&parser->newline_list, breakpoint); - if (mode->as.regexp.terminator != '\n') { - // If the terminator is not a newline, then we - // can set the next breakpoint and continue. - breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); - break; - } + // Here we've hit the terminator. If we have already consumed + // content then we need to return that content as string content + // first. + if (breakpoint > parser->current.start) { + parser->current.end = breakpoint; + LEX(YP_TOKEN_STRING_CONTENT); + } - // Otherwise, the newline character is the - // terminator so we need to continue on. - } + // Since we've hit the terminator of the regular expression, we now + // need to parse the options. + parser->current.end = breakpoint + 1; + parser->current.end += yp_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); - assert(*breakpoint == mode->as.regexp.terminator); + lex_mode_pop(parser); + lex_state_set(parser, YP_LEX_STATE_END); + LEX(YP_TOKEN_REGEXP_END); + } - if (mode->as.regexp.nesting > 0) { - breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); - mode->as.regexp.nesting--; - break; - } + // If we hit escapes, then we need to treat the next token + // literally. In this case we'll skip past the next character + // and find the next breakpoint. + if (*breakpoint == '\\') { + size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, YP_UNESCAPE_ALL, false, &parser->error_list); - // Here we've hit the terminator. If we have already consumed - // content then we need to return that content as string content - // first. - if (breakpoint > parser->current.start) { - parser->current.end = breakpoint; - LEX(YP_TOKEN_STRING_CONTENT); - } + // If the result is an escaped newline, then we need to + // track that newline. + if (breakpoint[difference - 1] == '\n') { + yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); + } - // Since we've hit the terminator of the regular expression, we now - // need to parse the options. - parser->current.end = breakpoint + 1; - parser->current.end += yp_strspn_regexp_option(parser->current.end, parser->end - parser->current.end); + breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); + continue; + } - lex_mode_pop(parser); - lex_state_set(parser, YP_LEX_STATE_END); - LEX(YP_TOKEN_REGEXP_END); + // If we hit a #, then we will attempt to lex interpolation. + if (*breakpoint == '#') { + yp_token_type_t type = lex_interpolation(parser, breakpoint); + if (type != YP_TOKEN_NOT_PROVIDED) { + LEX(type); } + + // If we haven't returned at this point then we had + // something that looked like an interpolated class or + // instance variable like "#@" but wasn't actually. In this + // case we'll just skip to the next breakpoint. + breakpoint = yp_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end); + continue; } + + // If we've hit the incrementor, then we need to skip past it + // and find the next breakpoint. + assert(*breakpoint == lex_mode->as.regexp.incrementor); + breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); + lex_mode->as.regexp.nesting++; + continue; } // At this point, the breakpoint is NULL which means we were unable to |
