summaryrefslogtreecommitdiff
path: root/prism
diff options
context:
space:
mode:
authorKevin Menard <kevin@nirvdrum.com>2024-01-24 16:39:06 -0500
committergit <svn-admin@ruby-lang.org>2024-01-26 20:15:19 +0000
commit2a509787cb8869301b614139218432aef9b68f9b (patch)
tree93c7e278fd5124be8697581b1a53d78b923ffed7 /prism
parent3d996e827f2ff74a1bb7e978d754cea7d957b9eb (diff)
[ruby/prism] Track whether a Symbol should have its encoding changed from the source encoding.
Ruby sets a Symbol literal's encoding to US-ASCII if the symbol consists only of US-ASCII code points. Character escapes can also lead a Symbol to have a different encoding than its source's encoding. https://github.com/ruby/prism/commit/f315660b31
Diffstat (limited to 'prism')
-rw-r--r--prism/prism.c69
1 files changed, 65 insertions, 4 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 107b284e1d..a68577b4dc 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5472,6 +5472,53 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
}
/**
+ * Read through the contents of a string and check if it consists solely of US ASCII code points.
+ */
+static bool
+ascii_only_p( const pm_string_t *contents) {
+ const size_t length = contents->length;
+
+ for (size_t i = 0; i < length; i++) {
+ if (contents->source[i] & 0x80) { // High bit set: this byte is outside the 7-bit US-ASCII range.
+ return false;
+ }
+ }
+
+ return true; // Also reached for a zero-length string, since the loop body never runs.
+}
+
+/**
+ * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated encoding is ASCII-compatible and
+ * the Symbol consists only of US-ASCII code points. Otherwise, the encoding may be explicitly set with an
+ * escape sequence.
+ */
+static inline pm_node_flags_t
+parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
+ // Don't set any flags on the Symbol if it hasn't been populated yet.
+ if (contents->source == NULL) {
+ return 0;
+ }
+
+ // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all symbols appearing in
+ // source are eligible for "downgrading" to US-ASCII.
+ if (ascii_only_p(contents)) {
+ return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
+ } else {
+ // A Symbol may optionally have its encoding explicitly set.
+ //
+ // NB: an explicitly set encoding is ignored by Ruby if the Symbol consists of only US ASCII code points.
+ if (parser->explicit_encoding != NULL) {
+ if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+ return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
+ } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { // Note: tests parser->encoding here, not parser->explicit_encoding.
+ return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
+ }
+ }
+ }
+ return 0; // No flag: the contents have a non-ASCII byte and neither forced-encoding branch above matched.
+}
+
+/**
* Allocate and initialize a new SymbolNode node with the given unescaped
* string.
*/
@@ -5494,6 +5541,8 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
.unescaped = *unescaped
};
+ pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, unescaped));
+
return node;
}
@@ -5532,6 +5581,7 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
assert((label.end - label.start) >= 0);
pm_string_shared_init(&node->unescaped, label.start, label.end);
+ pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, &node->unescaped));
break;
}
case PM_TOKEN_MISSING: {
@@ -5594,6 +5644,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
.unescaped = node->unescaped
};
+ pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
+
// We are explicitly _not_ using pm_node_destroy here because we don't want
// to trash the unescaped string. We could instead copy the string if we
// know that it is owned, but we're taking the fast path for now.
@@ -8115,7 +8167,6 @@ pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
/**
* When we're about to return from lexing the current token and we know for sure
* that we have found an escape sequence, this function is called to copy the
- *
* contents of the token buffer into the current string on the parser so that it
* can be attached to the correct node.
*/
@@ -8130,7 +8181,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
* string. If we haven't pushed anything into the buffer, this means that we
* never found an escape sequence, so we can directly reference the bounds of
* the current string. Either way, at the return of this function it is expected
- *
* that parser->current_string is established in such a way that it can be
* attached to a node.
*/
@@ -8149,7 +8199,6 @@ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
* point into the buffer because we're about to provide a string that has
* different content than a direct slice of the source.
*
- *
* It is expected that the parser's current token end will be pointing at one
* byte past the backslash that starts the escape sequence.
*/
@@ -12595,8 +12644,11 @@ PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int
static inline pm_node_flags_t
parse_unescaped_encoding(const pm_parser_t *parser) {
if (parser->explicit_encoding != NULL) {
+ // If there's an explicit encoding and it's using a UTF-8 escape sequence, then mark the string as UTF-8.
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
+ // If there's a non-UTF-8 escape sequence being used, then the string uses the source encoding, unless the source
+ // is marked as US-ASCII. In that case the string is forced as ASCII-8BIT in order to keep the string valid.
} else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
}
@@ -12749,6 +12801,7 @@ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_sta
parser_lex(parser);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
+ pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
return (pm_node_t *) symbol;
}
@@ -12787,6 +12840,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
+ pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
+
return (pm_node_t *) symbol;
}
@@ -12872,6 +12927,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
} else {
content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
pm_string_shared_init(&unescaped, content.start, content.end);
+
}
if (next_state != PM_LEX_STATE_NONE) {
@@ -12883,7 +12939,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
} else {
expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
}
- return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+
+ pm_symbol_node_t *symbol_node = pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+ pm_node_flag_set((pm_node_t *)symbol_node, parse_symbol_encoding(parser, &symbol_node->unescaped));
+
+ return (pm_node_t *) symbol_node;
}
/**
@@ -12947,6 +13007,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
+ pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
return (pm_node_t *) symbol;
}
case PM_TOKEN_SYMBOL_BEGIN: {