summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Newton <kddnewton@gmail.com>2024-05-01 12:35:28 -0400
committergit <svn-admin@ruby-lang.org>2024-05-01 16:43:05 +0000
commit41f8ae1ffdca349d0301b480c690e83a1dd1e723 (patch)
tree30f5839d7d1b3bf99e0260929c4196bf72b99d90
parent0fa09c5729fab6b5eb084a01d582587faf6405a3 (diff)
[ruby/prism] Mark errors for invalid symbols
https://github.com/ruby/prism/commit/661884c4a3
-rw-r--r--prism/config.yml1
-rw-r--r--prism/prism.c69
-rw-r--r--prism/templates/src/diagnostic.c.erb1
-rw-r--r--test/prism/unescape_test.rb2
4 files changed, 61 insertions, 12 deletions
diff --git a/prism/config.yml b/prism/config.yml
index 56c95e4e55..6e6e3d1e95 100644
--- a/prism/config.yml
+++ b/prism/config.yml
@@ -149,6 +149,7 @@ errors:
- INVALID_RETRY_AFTER_ELSE
- INVALID_RETRY_AFTER_ENSURE
- INVALID_RETRY_WITHOUT_RESCUE
+ - INVALID_SYMBOL
- INVALID_VARIABLE_GLOBAL
- INVALID_VARIABLE_GLOBAL_3_3_0
- INVALID_YIELD
diff --git a/prism/prism.c b/prism/prism.c
index 24812ca483..1d4f268631 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6942,7 +6942,8 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
}
/**
- * Read through the contents of a string and check if it consists solely of US ASCII code points.
+ * Read through the contents of a string and check if it consists solely of
+ * US-ASCII code points.
*/
static bool
pm_ascii_only_p(const pm_string_t *contents) {
@@ -6957,26 +6958,71 @@ pm_ascii_only_p(const pm_string_t *contents) {
}
/**
+ * Validate that the contents of the given symbol are all valid UTF-8.
+ */
+static void
+parse_symbol_encoding_validate_utf8(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
+ for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
+ size_t width = pm_encoding_utf_8_char_width(cursor, end - cursor);
+
+ if (width == 0) {
+ pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
+ break;
+ }
+
+ cursor += width;
+ }
+}
+
+/**
+ * Validate that the contents of the given symbol are all valid in the encoding
+ * of the parser.
+ */
+static void
+parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents) {
+ const pm_encoding_t *encoding = parser->encoding;
+
+ for (const uint8_t *cursor = pm_string_source(contents), *end = cursor + pm_string_length(contents); cursor < end;) {
+ size_t width = encoding->char_width(cursor, end - cursor);
+
+ if (width == 0) {
+ pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL);
+ break;
+ }
+
+ cursor += width;
+ }
+}
+
+/**
* Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated
* encoding is ASCII-compatible and the Symbol consists only of US-ASCII code
* points. Otherwise, the encoding may be explicitly set with an escape
* sequence.
+ *
+ * If the validate flag is set, then it will check the contents of the symbol
+ * to ensure that all characters are valid in the encoding.
*/
static inline pm_node_flags_t
-parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
+parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents, bool validate) {
if (parser->explicit_encoding != NULL) {
// A Symbol may optionally have its encoding explicitly set. This will
// happen if an escape sequence results in a non-ASCII code point.
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+ if (validate) parse_symbol_encoding_validate_utf8(parser, location, contents);
return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
+ } else if (validate) {
+ parse_symbol_encoding_validate_other(parser, location, contents);
}
} else if (pm_ascii_only_p(contents)) {
// Ruby stipulates that all source files must use an ASCII-compatible
// encoding. Thus, all symbols appearing in source are eligible for
// "downgrading" to US-ASCII.
return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
+ } else if (validate) {
+ parse_symbol_encoding_validate_other(parser, location, contents);
}
return 0;
@@ -7144,7 +7190,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t
*/
static pm_symbol_node_t *
pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) {
- pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string));
+ pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, value, &parser->current_string, false));
parser->current_string = PM_STRING_EMPTY;
return node;
}
@@ -7166,7 +7212,7 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
assert((label.end - label.start) >= 0);
pm_string_shared_init(&node->unescaped, label.start, label.end);
- pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped));
+ pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &label, &node->unescaped, false));
break;
}
@@ -7251,7 +7297,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
.unescaped = node->unescaped
};
- pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
+ pm_token_t content = { .type = PM_TOKEN_IDENTIFIER, .start = node->content_loc.start, .end = node->content_loc.end };
+ pm_node_flag_set((pm_node_t *) new_node, parse_symbol_encoding(parser, &content, &node->unescaped, true));
// We are explicitly _not_ using pm_node_destroy here because we don't want
// to trash the unescaped string. We could instead copy the string if we
@@ -15259,7 +15306,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
return (pm_node_t *) symbol;
}
@@ -15359,7 +15406,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
}
- return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
+ return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, false));
}
/**
@@ -15384,7 +15431,7 @@ parse_undef_argument(pm_parser_t *parser) {
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
return (pm_node_t *) symbol;
}
@@ -15425,7 +15472,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
- pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped));
+ pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false));
return (pm_node_t *) symbol;
}
@@ -16590,7 +16637,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
pm_node_list_free(&parts);
} else if (accept1(parser, PM_TOKEN_LABEL_END) && !state_is_arg_labeled) {
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true));
} else if (match1(parser, PM_TOKEN_EOF)) {
pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF);
node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
@@ -16616,7 +16663,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
pm_node_flag_set(node, parse_unescaped_encoding(parser));
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF);
} else if (accept1(parser, PM_TOKEN_LABEL_END)) {
- node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped));
+ node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true));
} else {
// If we get here, then we have interpolation so we'll need
// to create a string or symbol node with interpolation.
diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb
index 839013714b..02035fec24 100644
--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@@ -232,6 +232,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_INVALID_RETRY_AFTER_ELSE] = { "Invalid retry after else", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_INVALID_RETRY_AFTER_ENSURE] = { "Invalid retry after ensure", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_INVALID_RETRY_WITHOUT_RESCUE] = { "Invalid retry without rescue", PM_ERROR_LEVEL_SYNTAX },
+ [PM_ERR_INVALID_SYMBOL] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_INVALID_VARIABLE_GLOBAL] = { "'%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_INVALID_YIELD] = { "Invalid yield", PM_ERROR_LEVEL_SYNTAX },
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
index 2a352c5234..3f78a59b11 100644
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@@ -38,7 +38,7 @@ module Prism
end
def prism(escape)
- result = Prism.parse(code(escape))
+ result = Prism.parse(code(escape), encoding: "binary")
if result.success?
yield result.value.statements.body.first