diff options
author | Kevin Newton <kddnewton@gmail.com> | 2024-03-08 10:12:19 -0500 |
---|---|---|
committer | git <svn-admin@ruby-lang.org> | 2024-03-08 18:48:55 +0000 |
commit | 609bbad15da6fe91904bdcd139f9e24e3cf61d4b (patch) | |
tree | 802afc4c64b721525b08945e81f7373608bfc1c3 | |
parent | a564f30fb8a16d7d32803eb11630ef8d6d762c53 (diff) |
[ruby/prism] Fix up regexp memory leaks
https://github.com/ruby/prism/commit/4dc58a533a
-rw-r--r-- | prism/prism.c | 64 | ||||
-rw-r--r-- | prism/templates/src/diagnostic.c.erb | 8 | ||||
-rw-r--r-- | test/prism/unescape_test.rb | 2 |
3 files changed, 44 insertions, 30 deletions
diff --git a/prism/prism.c b/prism/prism.c index 9bd18043e4..903c7511de 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -5949,7 +5949,7 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { return 0; } -static inline pm_node_flags_t +static pm_node_flags_t parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || @@ -5974,7 +5974,7 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); if (modifier == 'n' && !ascii_only) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source)); } } @@ -5985,18 +5985,18 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con bool mixed_encoding = false; if (mixed_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily. bool valid_string_in_modifier_encoding = true; if (!valid_string_in_modifier_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); } } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now. if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source)); } } @@ -6010,13 +6010,12 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding * may be explicitly set with an escape sequence. */ -static inline pm_node_flags_t +static pm_node_flags_t parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) { // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. bool valid_unicode_range = true; if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source)); - + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source)); return flags; } @@ -17143,13 +17142,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); - pm_node_t *regular_expression_node = (pm_node_t *) (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); - pm_node_flag_set(regular_expression_node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); + pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); + pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); - return regular_expression_node; + return node; } - pm_interpolated_regular_expression_node_t *node; + pm_interpolated_regular_expression_node_t *interpolated; if (match1(parser, PM_TOKEN_STRING_CONTENT)) { // In this case we've hit string content so we know the regular @@ -17157,40 +17156,57 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // following token is the end (in which case we can return a plain // regular expression) or if it's not then it has interpolation. pm_string_t unescaped = parser->current_string; - pm_string_t *source = &parser->current_regular_expression_source; pm_token_t content = parser->current; + + pm_string_t source = parser->current_regular_expression_source; + pm_string_constant_init(&parser->current_regular_expression_source, "", 0); + parser_lex(parser); // If we hit an end, then we can create a regular expression node // without interpolation, which can be represented more succinctly and // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { - pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, source); - pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding(parser, source, &unescaped, regular_expression_node->flags)); - return regular_expression_node; + pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source); + pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, node->flags)); + pm_string_free(&unescaped); + return node; } // If we get here, then we have interpolation so we'll need to create // a regular expression node with interpolation. - node = pm_interpolated_regular_expression_node_create(parser, &opening); + interpolated = pm_interpolated_regular_expression_node_create(parser, &opening); pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); - pm_interpolated_regular_expression_node_append(node, part); + pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &source); + pm_interpolated_regular_expression_node_append(interpolated, part); + pm_string_free(&unescaped); } else { // If the first part of the body of the regular expression is not a // string content, then we have interpolation and we need to create an // interpolated regular expression node. - node = pm_interpolated_regular_expression_node_create(parser, &opening); + interpolated = pm_interpolated_regular_expression_node_create(parser, &opening); } // Now that we're here and we have interpolation, we'll parse all of the // parts into the list. pm_node_t *part; while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { - if ((part = parse_string_part(parser)) != NULL) { - pm_interpolated_regular_expression_node_append(node, part); + if (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_token_t opening = not_provided(parser); + pm_token_t closing = not_provided(parser); + + pm_node_t *node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->current, &closing, &parser->current_regular_expression_source); + pm_node_flag_set(node, parse_unescaped_encoding(parser)); + + pm_string_free(&parser->current_string); + pm_string_constant_init(&parser->current_regular_expression_source, "", 0); + + parser_lex(parser); + pm_interpolated_regular_expression_node_append(interpolated, node); + } else if ((part = parse_string_part(parser)) != NULL) { + pm_interpolated_regular_expression_node_append(interpolated, part); } } @@ -17201,9 +17217,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } else { expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM); } - pm_interpolated_regular_expression_node_closing_set(node, &closing); - return (pm_node_t *) node; + pm_interpolated_regular_expression_node_closing_set(interpolated, &closing); + return (pm_node_t *) interpolated; } case PM_TOKEN_BACKTICK: case PM_TOKEN_PERCENT_LOWER_X: { diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index 06681dda91..64e1a1d742 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -206,7 +206,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_INVALID_CHARACTER] = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_MULTIBYTE_CHAR] = { "invalid multibyte char (%s)", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_MULTIBYTE_CHARACTER] = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL }, - [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%.*s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_PRINTABLE_CHARACTER] = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_PERCENT] = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT? [PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_FATAL }, @@ -273,9 +273,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_FATAL }, - [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%s/", PM_ERROR_LEVEL_FATAL }, - [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%s/", PM_ERROR_LEVEL_FATAL }, - [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL }, diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb index 72ad780d8b..2a352c5234 100644 --- a/test/prism/unescape_test.rb +++ b/test/prism/unescape_test.rb @@ -230,8 +230,6 @@ module Prism else assert_equal expected.bytes, actual.bytes, message end - rescue Exception - binding.irb end end end |