summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Newton <kddnewton@gmail.com>2024-03-08 10:12:19 -0500
committergit <svn-admin@ruby-lang.org>2024-03-08 18:48:55 +0000
commit609bbad15da6fe91904bdcd139f9e24e3cf61d4b (patch)
tree802afc4c64b721525b08945e81f7373608bfc1c3
parenta564f30fb8a16d7d32803eb11630ef8d6d762c53 (diff)
[ruby/prism] Fix up regexp memory leaks
https://github.com/ruby/prism/commit/4dc58a533a
-rw-r--r--prism/prism.c64
-rw-r--r--prism/templates/src/diagnostic.c.erb8
-rw-r--r--test/prism/unescape_test.rb2
3 files changed, 44 insertions, 30 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 9bd18043e4..903c7511de 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5949,7 +5949,7 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
return 0;
}
-static inline pm_node_flags_t
+static pm_node_flags_t
parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
(modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
@@ -5974,7 +5974,7 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
if (modifier == 'n' && !ascii_only) {
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source));
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source));
}
}
@@ -5985,18 +5985,18 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con
bool mixed_encoding = false;
if (mixed_encoding) {
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source));
} else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
// TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily.
bool valid_string_in_modifier_encoding = true;
if (!valid_string_in_modifier_encoding) {
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source));
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source));
}
} else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
// TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now.
if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source));
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source));
}
}
@@ -6010,13 +6010,12 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con
* when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding
* may be explicitly set with an escape sequence.
*/
-static inline pm_node_flags_t
+static pm_node_flags_t
parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) {
// TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
bool valid_unicode_range = true;
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
- PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source));
-
+ PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source));
return flags;
}
@@ -17143,13 +17142,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
parser_lex(parser);
- pm_node_t *regular_expression_node = (pm_node_t *) (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
- pm_node_flag_set(regular_expression_node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING);
+ pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous);
+ pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING);
- return regular_expression_node;
+ return node;
}
- pm_interpolated_regular_expression_node_t *node;
+ pm_interpolated_regular_expression_node_t *interpolated;
if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
// In this case we've hit string content so we know the regular
@@ -17157,40 +17156,57 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
// following token is the end (in which case we can return a plain
// regular expression) or if it's not then it has interpolation.
pm_string_t unescaped = parser->current_string;
- pm_string_t *source = &parser->current_regular_expression_source;
pm_token_t content = parser->current;
+
+ pm_string_t source = parser->current_regular_expression_source;
+ pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
+
parser_lex(parser);
// If we hit an end, then we can create a regular expression node
// without interpolation, which can be represented more succinctly and
// more easily compiled.
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
- pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, source);
- pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding(parser, source, &unescaped, regular_expression_node->flags));
- return regular_expression_node;
+ pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
+ pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, node->flags));
+ pm_string_free(&unescaped);
+ return node;
}
// If we get here, then we have interpolation so we'll need to create
// a regular expression node with interpolation.
- node = pm_interpolated_regular_expression_node_create(parser, &opening);
+ interpolated = pm_interpolated_regular_expression_node_create(parser, &opening);
pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser);
- pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
- pm_interpolated_regular_expression_node_append(node, part);
+ pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &source);
+ pm_interpolated_regular_expression_node_append(interpolated, part);
+ pm_string_free(&unescaped);
} else {
// If the first part of the body of the regular expression is not a
// string content, then we have interpolation and we need to create an
// interpolated regular expression node.
- node = pm_interpolated_regular_expression_node_create(parser, &opening);
+ interpolated = pm_interpolated_regular_expression_node_create(parser, &opening);
}
// Now that we're here and we have interpolation, we'll parse all of the
// parts into the list.
pm_node_t *part;
while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) {
- if ((part = parse_string_part(parser)) != NULL) {
- pm_interpolated_regular_expression_node_append(node, part);
+ if (match1(parser, PM_TOKEN_STRING_CONTENT)) {
+ pm_token_t opening = not_provided(parser);
+ pm_token_t closing = not_provided(parser);
+
+ pm_node_t *node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->current, &closing, &parser->current_regular_expression_source);
+ pm_node_flag_set(node, parse_unescaped_encoding(parser));
+
+ pm_string_free(&parser->current_string);
+ pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
+
+ parser_lex(parser);
+ pm_interpolated_regular_expression_node_append(interpolated, node);
+ } else if ((part = parse_string_part(parser)) != NULL) {
+ pm_interpolated_regular_expression_node_append(interpolated, part);
}
}
@@ -17201,9 +17217,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
} else {
expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM);
}
- pm_interpolated_regular_expression_node_closing_set(node, &closing);
- return (pm_node_t *) node;
+ pm_interpolated_regular_expression_node_closing_set(interpolated, &closing);
+ return (pm_node_t *) interpolated;
}
case PM_TOKEN_BACKTICK:
case PM_TOKEN_PERCENT_LOWER_X: {
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
index 06681dda91..64e1a1d742 100644
--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@@ -206,7 +206,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_INVALID_CHARACTER] = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL },
[PM_ERR_INVALID_MULTIBYTE_CHAR] = { "invalid multibyte char (%s)", PM_ERROR_LEVEL_FATAL },
[PM_ERR_INVALID_MULTIBYTE_CHARACTER] = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL },
- [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%s/", PM_ERROR_LEVEL_FATAL },
+ [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%.*s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_INVALID_PRINTABLE_CHARACTER] = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL },
[PM_ERR_INVALID_PERCENT] = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT?
[PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_FATAL },
@@ -273,9 +273,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_FATAL },
[PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_FATAL },
- [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%s/", PM_ERROR_LEVEL_FATAL },
- [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%s/", PM_ERROR_LEVEL_FATAL },
- [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%s/", PM_ERROR_LEVEL_FATAL },
+ [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_FATAL },
+ [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_FATAL },
+ [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL },
[PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL },
[PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL },
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
index 72ad780d8b..2a352c5234 100644
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@@ -230,8 +230,6 @@ module Prism
else
assert_equal expected.bytes, actual.bytes, message
end
- rescue Exception
- binding.irb
end
end
end