[ruby/prism] Track whether a Symbol should have its encoding changed from the source encoding.

Ruby sets a Symbol literal's encoding to US-ASCII if the symbol consists only of US-ASCII code points. Character escapes can also lead a Symbol to have a different encoding than its source's encoding. https://github.com/ruby/prism/commit/f315660b31
author: Kevin Menard <kevin@nirvdrum.com> 2024-01-24 16:39:06 -0500
committer: git <svn-admin@ruby-lang.org> 2024-01-26 20:15:19 +0000
commit: 2a509787cb8869301b614139218432aef9b68f9b (patch)
tree: 93c7e278fd5124be8697581b1a53d78b923ffed7 /prism
parent: 3d996e827f2ff74a1bb7e978d754cea7d957b9eb (diff)
1 files changed, 65 insertions, 4 deletions
diff --git a/prism/prism.c b/prism/prism.c
index 107b284e1d..a68577b4dc 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5472,6 +5472,53 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument
 }
 
 /**
+ * Report whether a string holds only US-ASCII code points, i.e. no byte has its high bit set (true if empty).
+ */
+static bool
+ascii_only_p(const pm_string_t *contents) {
+    size_t index = 0;
+
+    while (index < contents->length) {
+        if ((contents->source[index] & 0x80) != 0) {
+            return false;
+        }
+        index++;
+    }
+    return true;
+}
+
+/**
+ * Compute the encoding flag for a Symbol from its unescaped contents. Ruby "downgrades" a Symbol to US-ASCII
+ * when it consists only of US-ASCII code points; otherwise an escape sequence may have explicitly set the
+ * encoding. Returns 0 when no encoding flag applies.
+ */
+static inline pm_node_flags_t
+parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
+    // A Symbol whose contents have not been populated yet gets no flags.
+    if (contents->source == NULL) {
+        return 0;
+    }
+
+    // Ruby requires every source file to use an ASCII-compatible encoding, so any Symbol in source may be
+    // downgraded to US-ASCII. This check comes first because Ruby ignores an explicit encoding in that case.
+    if (ascii_only_p(contents)) {
+        return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING;
+    }
+
+    // The Symbol is not ASCII-only, so a flag is set only if an encoding was set explicitly: an explicit UTF-8
+    // encoding forces UTF-8, and any other explicit encoding forces binary when the source is US-ASCII.
+    if (parser->explicit_encoding == NULL) {
+        return 0;
+    } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+        return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING;
+    } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+        return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING;
+    }
+
+    return 0;
+}
+
+/**
  * Allocate and initialize a new SymbolNode node with the given unescaped
  * string.
  */
@@ -5494,6 +5541,8 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
         .unescaped = *unescaped
     };
 
+    pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, unescaped));
+
     return node;
 }
 
@@ -5532,6 +5581,7 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) {
 
             assert((label.end - label.start) >= 0);
             pm_string_shared_init(&node->unescaped, label.start, label.end);
+            pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, &node->unescaped));
             break;
         }
         case PM_TOKEN_MISSING: {
@@ -5594,6 +5644,8 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const
         .unescaped = node->unescaped
     };
 
+    pm_node_flag_set((pm_node_t *)new_node, parse_symbol_encoding(parser, &node->unescaped));
+
     // We are explicitly _not_ using pm_node_destroy here because we don't want
     // to trash the unescaped string. We could instead copy the string if we
     // know that it is owned, but we're taking the fast path for now.
@@ -8115,7 +8167,6 @@ pm_token_buffer_push(pm_token_buffer_t *token_buffer, uint8_t byte) {
 /**
  * When we're about to return from lexing the current token and we know for sure
  * that we have found an escape sequence, this function is called to copy the
- *
  * contents of the token buffer into the current string on the parser so that it
  * can be attached to the correct node.
  */
@@ -8130,7 +8181,6 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
  * string. If we haven't pushed anything into the buffer, this means that we
  * never found an escape sequence, so we can directly reference the bounds of
  * the current string. Either way, at the return of this function it is expected
- *
  * that parser->current_string is established in such a way that it can be
  * attached to a node.
  */
@@ -8149,7 +8199,6 @@ pm_token_buffer_flush(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
  * point into the buffer because we're about to provide a string that has
  * different content than a direct slice of the source.
  *
- *
  * It is expected that the parser's current token end will be pointing at one
  * byte past the backslash that starts the escape sequence.
  */
@@ -12595,8 +12644,11 @@ PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int
 static inline pm_node_flags_t
 parse_unescaped_encoding(const pm_parser_t *parser) {
     if (parser->explicit_encoding != NULL) {
+        // If there's an explicit encoding and it's using a UTF-8 escape sequence, then mark the string as UTF-8.
         if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
             return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
+        // If there's a non-UTF-8 escape sequence being used, then the string uses the source encoding, unless the source
+        // is marked as US-ASCII. In that case the string is forced as ASCII-8BIT in order to keep the string valid.
         } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
             return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
         }
@@ -12749,6 +12801,7 @@ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_sta
     parser_lex(parser);
 
     pm_string_shared_init(&symbol->unescaped, parser->previous.start, end);
+    pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
     return (pm_node_t *) symbol;
 }
 
@@ -12787,6 +12840,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
         pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
 
         pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
+        pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
+
         return (pm_node_t *) symbol;
     }
 
@@ -12872,6 +12927,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
     } else {
         content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end };
         pm_string_shared_init(&unescaped, content.start, content.end);
+
     }
 
     if (next_state != PM_LEX_STATE_NONE) {
@@ -12883,7 +12939,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s
     } else {
         expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC);
     }
-    return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+
+    pm_symbol_node_t *symbol_node = pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+    pm_node_flag_set((pm_node_t *)symbol_node, parse_symbol_encoding(parser, &symbol_node->unescaped));
+
+    return (pm_node_t *) symbol_node;
 }
 
 /**
@@ -12947,6 +13007,7 @@ parse_alias_argument(pm_parser_t *parser, bool first) {
             pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing);
 
             pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end);
+            pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped));
             return (pm_node_t *) symbol;
         }
         case PM_TOKEN_SYMBOL_BEGIN: {
author	Kevin Menard <kevin@nirvdrum.com>	2024-01-24 16:39:06 -0500
committer	git <svn-admin@ruby-lang.org>	2024-01-26 20:15:19 +0000
commit	2a509787cb8869301b614139218432aef9b68f9b (patch)
tree	93c7e278fd5124be8697581b1a53d78b923ffed7 /prism
parent	3d996e827f2ff74a1bb7e978d754cea7d957b9eb (diff)