[ruby/prism] Provide flags for changing encodings

https://github.com/ruby/prism/commit/e838eaff6f
author: Kevin Newton <kddnewton@gmail.com> 2023-12-04 12:51:22 -0500
committer: Kevin Newton <kddnewton@gmail.com> 2023-12-06 14:23:38 -0500
commit: 82f18baa21d0df59c30d8a6e60bf3e0991de1114 (patch)
tree: d861044ddaf3d334fee10325f15eab9887ae546b
parent: 9620ca678929f28dd8dab8e278e438a430a85022 (diff)
31 files changed, 455 insertions, 226 deletions
diff --git a/prism/config.yml b/prism/config.yml
index f7b6751eaa..381e5efcbc 100644
--- a/prism/config.yml
+++ b/prism/config.yml
@@ -346,6 +346,13 @@ flags:
       - name: VARIABLE_CALL
         comment: "a call that could have been a local variable"
     comment: Flags for call nodes.
+  - name: EncodingFlags
+    values:
+      - name: FORCED_UTF8_ENCODING
+        comment: "internal bytes forced the encoding to UTF-8"
+      - name: FORCED_BINARY_ENCODING
+        comment: "internal bytes forced the encoding to binary"
+    comment: Flags for nodes that have unescaped content.
   - name: IntegerBaseFlags
     values:
       - name: BINARY
@@ -388,6 +395,10 @@ flags:
     comment: Flags for regular expression and match last line nodes.
   - name: StringFlags
     values:
+      - name: FORCED_UTF8_ENCODING
+        comment: "internal bytes forced the encoding to UTF-8"
+      - name: FORCED_BINARY_ENCODING
+        comment: "internal bytes forced the encoding to binary"
       - name: FROZEN
         comment: "frozen by virtue of a `frozen_string_literal` comment"
     comment: Flags for string nodes.
@@ -2576,6 +2587,9 @@ nodes:
           ^^^^^^^^^^^^^^^^^^^^
   - name: XStringNode
     fields:
+      - name: flags
+        type: flags
+        kind: EncodingFlags
       - name: opening_loc
         type: location
       - name: content_loc
diff --git a/prism/defines.h b/prism/defines.h
index f89a0bed8e..c9715c4eb0 100644
--- a/prism/defines.h
+++ b/prism/defines.h
@@ -74,4 +74,21 @@
 #   define snprintf _snprintf
 #endif
 
+/**
+ * A simple utility macro to concatenate two tokens together, necessary when one
+ * of the tokens is itself a macro.
+ */
+#define PM_CONCATENATE(left, right) left ## right
+
+/**
+ * Static assertions were only standardized in C11, where _Static_assert is a
+ * keyword (so it cannot be detected with defined()). For older standards we
+ * polyfill with a typedef whose array size is negative if the condition fails.
+ */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#   define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message)
+#else
+#   define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1]
+#endif
+
 #endif
diff --git a/prism/diagnostic.c b/prism/diagnostic.c
index f9fd95cb06..7cffce7c9f 100644
--- a/prism/diagnostic.c
+++ b/prism/diagnostic.c
@@ -185,6 +185,7 @@ static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
     [PM_ERR_LIST_W_UPPER_ELEMENT]               = "expected a string in a `%W` list",
     [PM_ERR_LIST_W_UPPER_TERM]                  = "expected a closing delimiter for the `%W` list",
     [PM_ERR_MALLOC_FAILED]                      = "failed to allocate memory",
+    [PM_ERR_MIXED_ENCODING]                     = "UTF-8 mixed within %s source",
     [PM_ERR_MODULE_IN_METHOD]                   = "unexpected module definition in a method definition",
     [PM_ERR_MODULE_NAME]                        = "expected a constant name after `module`",
     [PM_ERR_MODULE_TERM]                        = "expected an `end` to close the `module` statement",
diff --git a/prism/diagnostic.h b/prism/diagnostic.h
index fc408ccbd6..079d409147 100644
--- a/prism/diagnostic.h
+++ b/prism/diagnostic.h
@@ -177,6 +177,7 @@ typedef enum {
     PM_ERR_LIST_W_UPPER_ELEMENT,
     PM_ERR_LIST_W_UPPER_TERM,
     PM_ERR_MALLOC_FAILED,
+    PM_ERR_MIXED_ENCODING,
     PM_ERR_MODULE_IN_METHOD,
     PM_ERR_MODULE_NAME,
     PM_ERR_MODULE_TERM,
diff --git a/prism/encoding.c b/prism/encoding.c
index 4bf6b6a775..3493353b04 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -4212,9 +4212,9 @@ pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) {
 }
 
 /**
- * This is the definition of all of the encodings that we support.
+ * This is the table of all of the encodings that prism supports.
  */
-static const pm_encoding_t pm_encodings[] = {
+const pm_encoding_t pm_encodings[] = {
     [PM_ENCODING_UTF_8] = {
         .name = "UTF-8",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4223,14 +4223,6 @@ static const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_utf_8_isupper_char,
         .multibyte = true
     },
-    [PM_ENCODING_ASCII] = {
-        .name = "US-ASCII",
-        .char_width = pm_encoding_ascii_char_width,
-        .alnum_char = pm_encoding_ascii_alnum_char,
-        .alpha_char = pm_encoding_ascii_alpha_char,
-        .isupper_char = pm_encoding_ascii_isupper_char,
-        .multibyte = false
-    },
     [PM_ENCODING_ASCII_8BIT] = {
         .name = "ASCII-8BIT",
         .char_width = pm_encoding_single_char_width,
@@ -4815,6 +4807,14 @@ static const pm_encoding_t pm_encodings[] = {
         .isupper_char = pm_encoding_tis_620_isupper_char,
         .multibyte = false
     },
+    [PM_ENCODING_US_ASCII] = {
+        .name = "US-ASCII",
+        .char_width = pm_encoding_ascii_char_width,
+        .alnum_char = pm_encoding_ascii_alnum_char,
+        .alpha_char = pm_encoding_ascii_alpha_char,
+        .isupper_char = pm_encoding_ascii_isupper_char,
+        .multibyte = false
+    },
     [PM_ENCODING_UTF8_MAC] = {
         .name = "UTF8-MAC",
         .char_width = pm_encoding_utf_8_char_width,
@@ -4938,11 +4938,6 @@ static const pm_encoding_t pm_encodings[] = {
 };
 
 /**
- * This is the default UTF-8 encoding. We need it to quickly create parsers.
- */
-const pm_encoding_t *pm_encoding_utf_8 = pm_encodings;
-
-/**
  * Parse the given name of an encoding and return a pointer to the corresponding
  * encoding struct if one can be found, otherwise return NULL.
  */
@@ -4961,7 +4956,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
         }
 
         // Otherwise we'll return the default UTF-8 encoding.
-        return pm_encoding_utf_8;
+        return PM_ENCODING_UTF_8_ENTRY;
     }
 
     // Next, we're going to loop through each of the encodings that we handle
@@ -4972,9 +4967,9 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
     if (width >= 3) {
         switch (*start) {
             case 'A': case 'a':
-                ENCODING1("ASCII", PM_ENCODING_ASCII);
+                ENCODING1("ASCII", PM_ENCODING_US_ASCII);
                 ENCODING1("ASCII-8BIT", PM_ENCODING_ASCII_8BIT);
-                ENCODING1("ANSI_X3.4-1968", PM_ENCODING_ASCII);
+                ENCODING1("ANSI_X3.4-1968", PM_ENCODING_US_ASCII);
                 break;
             case 'B': case 'b':
                 ENCODING1("BINARY", PM_ENCODING_ASCII_8BIT);
@@ -5109,7 +5104,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("TIS-620", PM_ENCODING_TIS_620);
                 break;
             case 'U': case 'u':
-                ENCODING1("US-ASCII", PM_ENCODING_ASCII);
+                ENCODING1("US-ASCII", PM_ENCODING_US_ASCII);
                 ENCODING2("UTF8-MAC", "UTF-8-HFS", PM_ENCODING_UTF8_MAC);
                 ENCODING1("UTF8-DoCoMo", PM_ENCODING_UTF8_DOCOMO);
                 ENCODING1("UTF8-KDDI", PM_ENCODING_UTF8_KDDI);
@@ -5129,7 +5124,7 @@ pm_encoding_find(const uint8_t *start, const uint8_t *end) {
                 ENCODING1("Windows-1258", PM_ENCODING_WINDOWS_1258);
                 break;
             case '6':
-                ENCODING1("646", PM_ENCODING_ASCII);
+                ENCODING1("646", PM_ENCODING_US_ASCII);
                 break;
         }
     }
diff --git a/prism/encoding.h b/prism/encoding.h
index 247db600ce..c286338160 100644
--- a/prism/encoding.h
+++ b/prism/encoding.h
@@ -125,7 +125,6 @@ extern const uint8_t pm_encoding_unicode_table[256];
  */
 typedef enum {
     PM_ENCODING_UTF_8 = 0,
-    PM_ENCODING_ASCII,
     PM_ENCODING_ASCII_8BIT,
     PM_ENCODING_BIG5,
     PM_ENCODING_BIG5_HKSCS,
@@ -199,6 +198,7 @@ typedef enum {
     PM_ENCODING_STATELESS_ISO_2022_JP,
     PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
     PM_ENCODING_TIS_620,
+    PM_ENCODING_US_ASCII,
     PM_ENCODING_UTF8_MAC,
     PM_ENCODING_UTF8_DOCOMO,
     PM_ENCODING_UTF8_KDDI,
@@ -213,13 +213,27 @@ typedef enum {
     PM_ENCODING_WINDOWS_1257,
     PM_ENCODING_WINDOWS_1258,
     PM_ENCODING_WINDOWS_31J,
-    PM_ENCODING_WINDOWS_874
+    PM_ENCODING_WINDOWS_874,
+    PM_ENCODING_MAXIMUM
 } pm_encoding_type_t;
 
 /**
- * This is the default UTF-8 encoding. We need it to quickly create parsers.
+ * This is the table of all of the encodings that prism supports.
  */
-extern const pm_encoding_t *pm_encoding_utf_8;
+extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
+
+/**
+ * This is the default UTF-8 encoding. We need a reference to it to quickly
+ * create parsers.
+ */
+#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
+
+/**
+ * This is the US-ASCII encoding. We need a reference to it to be able to
+ * compare against it when a string is being created because it could possibly
+ * need to fall back to ASCII-8BIT.
+ */
+#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
 
 /**
  * Parse the given name of an encoding and return a pointer to the corresponding
diff --git a/prism/extension.c b/prism/extension.c
index f6f2b6b195..fb252de3fe 100644
--- a/prism/extension.c
+++ b/prism/extension.c
@@ -469,7 +469,7 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) {
 static void
 parse_lex_encoding_changed_callback(pm_parser_t *parser) {
     parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
-    parse_lex_data->encoding = rb_enc_find(parser->encoding.name);
+    parse_lex_data->encoding = rb_enc_find(parser->encoding->name);
 
     // Since the encoding changed, we need to go back and change the encoding of
     // the tokens that were already lexed. This is only going to end up being
@@ -599,7 +599,7 @@ parse_input(pm_string_t *input, const pm_options_t *options) {
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
 
     pm_node_t *node = pm_parse(&parser);
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
 
     VALUE source = pm_source_new(&parser, encoding);
     VALUE result_argv[] = {
@@ -693,7 +693,7 @@ parse_input_comments(pm_string_t *input, const pm_options_t *options) {
     pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options);
 
     pm_node_t *node = pm_parse(&parser);
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
 
     VALUE source = pm_source_new(&parser, encoding);
     VALUE comments = parser_comments(&parser, source);
@@ -872,7 +872,7 @@ static VALUE
 named_captures(VALUE self, VALUE source) {
     pm_string_list_t string_list = { 0 };
 
-    if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, pm_encoding_utf_8)) {
+    if (!pm_regexp_named_capture_group_names((const uint8_t *) RSTRING_PTR(source), RSTRING_LEN(source), &string_list, false, PM_ENCODING_UTF_8_ENTRY)) {
         pm_string_list_free(&string_list);
         return Qnil;
     }
@@ -962,7 +962,7 @@ inspect_node(VALUE self, VALUE source) {
 
     pm_prettyprint(&buffer, &parser, node);
 
-    rb_encoding *encoding = rb_enc_find(parser.encoding.name);
+    rb_encoding *encoding = rb_enc_find(parser.encoding->name);
     VALUE string = rb_enc_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer), encoding);
 
     pm_buffer_free(&buffer);
diff --git a/prism/parser.h b/prism/parser.h
index 98d8c0159b..2c58131b19 100644
--- a/prism/parser.h
+++ b/prism/parser.h
@@ -523,12 +523,6 @@ struct pm_parser {
         size_t index;
     } lex_modes;
 
-    /**
-     * The common_whitespace value from the most-recently-popped heredoc mode of the lexer, so we
-     * can dedent the heredoc after popping the lex mode.
-     */
-    size_t current_string_common_whitespace;
-
     /** The pointer to the start of the source. */
     const uint8_t *start;
 
@@ -581,7 +575,7 @@ struct pm_parser {
      * The encoding functions for the current file is attached to the parser as
      * it's parsing so that it can change with a magic comment.
      */
-    pm_encoding_t encoding;
+    const pm_encoding_t *encoding;
 
     /**
      * When the encoding that is being used to parse the source is changed by
@@ -637,6 +631,37 @@ struct pm_parser {
      */
     int32_t start_line;
 
+    /**
+     * When a string-like expression is being lexed, any byte or escape sequence
+     * that resolves to a value whose top bit is set (i.e., >= 0x80) will
+     * explicitly set the encoding to the same encoding as the source.
+     * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
+     * resolves to a value whose top bit is set, then the encoding will be
+     * explicitly set to UTF-8.
+     *
+     * The _next_ time this happens, if the encoding that is about to become the
+     * explicitly set encoding does not match the previously set explicit
+     * encoding, a mixed encoding error will be emitted.
+     *
+     * When the expression is finished being lexed, the explicit encoding
+     * controls the encoding of the expression. For the most part this means
+     * that the expression will either be encoded in the source encoding or
+     * UTF-8. This holds for all encodings except US-ASCII. If the source is
+     * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
+     * expression will be encoded as ASCII-8BIT.
+     *
+     * Note that if the expression is a list, different elements within the same
+     * list can have different encodings, so this will get reset between each
+     * element. Furthermore all of this only applies to lists that support
+     * interpolation, because otherwise escapes that could change the encoding
+     * are ignored.
+     *
+     * At first glance, it may make more sense for this to live on the lexer
+     * mode, but we need it here to communicate back to the parser for character
+     * literals that do not push a new lexer mode.
+     */
+    const pm_encoding_t *explicit_encoding;
+
     /** Whether or not we're at the beginning of a command. */
     bool command_start;
 
diff --git a/prism/prism.c b/prism/prism.c
index fee14e395f..3ad21f3334 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -275,6 +275,7 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) {
         breakpoints[index++] = incrementor;
     }
 
+    parser->explicit_encoding = NULL;
     return lex_mode_push(parser, lex_mode);
 }
 
@@ -356,6 +357,7 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed
         breakpoints[index++] = incrementor;
     }
 
+    parser->explicit_encoding = NULL;
     return lex_mode_push(parser, lex_mode);
 }
 
@@ -539,7 +541,7 @@ pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_
  * Append an error to the list of errors on the parser using the location of the
  * given token and a format string.
  */
-#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, token->start, token->end, diag_id, __VA_ARGS__)
+#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) pm_diagnostic_list_append_format(&parser->error_list, (token).start, (token).end, diag_id, __VA_ARGS__)
 
 /**
  * Append a warning to the list of warnings on the parser.
@@ -5714,6 +5716,7 @@ pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening,
     *node = (pm_x_string_node_t) {
         {
             .type = PM_X_STRING_NODE,
+            .flags = PM_STRING_FLAGS_FROZEN,
             .location = {
                 .start = opening->start,
                 .end = closing->end
@@ -5922,12 +5925,12 @@ static inline size_t
 char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
     if (parser->encoding_changed) {
         size_t width;
-        if ((width = parser->encoding.alpha_char(b, parser->end - b)) != 0) {
+        if ((width = parser->encoding->alpha_char(b, parser->end - b)) != 0) {
             return width;
         } else if (*b == '_') {
             return 1;
         } else if (*b >= 0x80) {
-            return parser->encoding.char_width(b, parser->end - b);
+            return parser->encoding->char_width(b, parser->end - b);
         } else {
             return 0;
         }
@@ -5960,12 +5963,12 @@ static inline size_t
 char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
     if (parser->encoding_changed) {
         size_t width;
-        if ((width = parser->encoding.alnum_char(b, parser->end - b)) != 0) {
+        if ((width = parser->encoding->alnum_char(b, parser->end - b)) != 0) {
             return width;
         } else if (*b == '_') {
             return 1;
         } else if (*b >= 0x80) {
-            return parser->encoding.char_width(b, parser->end - b);
+            return parser->encoding->char_width(b, parser->end - b);
         } else {
             return 0;
         }
@@ -6148,8 +6151,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
     const pm_encoding_t *encoding = pm_encoding_find(start, end);
 
     if (encoding != NULL) {
-        if (encoding != pm_encoding_utf_8) {
-            parser->encoding = *encoding;
+        if (encoding != PM_ENCODING_UTF_8_ENTRY) {
+            parser->encoding = encoding;
             parser->encoding_changed = true;
             if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
         }
@@ -6205,7 +6208,7 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) {
     }
 
     const uint8_t *value_start = cursor;
-    while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end);
+    while ((*cursor == '-' || *cursor == '_' || parser->encoding->alnum_char(cursor, 1)) && ++cursor < end);
 
     if (!parser_lex_magic_comment_encoding_value(parser, value_start, cursor)) {
         // If we were unable to parse the encoding value, then we've got an
@@ -6239,7 +6242,7 @@ pm_char_is_magic_comment_key_delimiter(const uint8_t b) {
  */
 static inline const uint8_t *
 parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) {
-    while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding)) != NULL) {
+    while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, parser->encoding)) != NULL) {
         if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') {
             return cursor;
         }
@@ -6329,7 +6332,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
         // underscores. We only need to do this if there _is_ a dash in the key.
         pm_string_t key;
         const size_t key_length = (size_t) (key_end - key_start);
-        const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, &parser->encoding);
+        const uint8_t *dash = pm_memchr(key_start, '-', (size_t) key_length, parser->encoding_changed, parser->encoding);
 
         if (dash == NULL) {
             pm_string_shared_init(&key, key_start, key_end);
@@ -6341,7 +6344,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
             memcpy(buffer, key_start, width);
             buffer[dash - key_start] = '_';
 
-            while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, &parser->encoding)) != NULL) {
+            while ((dash = pm_memchr(dash + 1, '-', (size_t) (key_end - dash - 1), parser->encoding_changed, parser->encoding)) != NULL) {
                 buffer[dash - key_start] = '_';
             }
 
@@ -7000,7 +7003,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
     }
 
     if (encoding_changed) {
-        return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
+        return parser->encoding->isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
     }
     return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
 }
@@ -7214,7 +7217,18 @@ escape_byte(uint8_t value, const uint8_t flags) {
  * Write a unicode codepoint to the given buffer.
  */
 static inline void
-escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *start, const uint8_t *end, uint32_t value) {
+escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, const uint8_t *start, const uint8_t *end, uint32_t value) {
+    // \u escape sequences in string-like structures implicitly change the
+    // encoding to UTF-8 if they are >= 0x80 or if they are used in a character
+    // literal.
+    if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) {
+        if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) {
+            PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name);
+        }
+
+        parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+    }
+
     if (value <= 0x7F) { // 0xxxxxxx
         pm_buffer_append_byte(buffer, (uint8_t) value);
     } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
@@ -7238,6 +7252,23 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st
 }
 
 /**
+ * Append a byte to the unescape buffer. A byte >= 0x80 locks the explicit
+ * encoding to the source's, erroring if UTF-8 was set on a non-UTF-8 source.
+ */
+static inline void
+escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) {
+    if (byte >= 0x80) {
+        if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+        }
+
+        parser->explicit_encoding = parser->encoding;
+    }
+
+    pm_buffer_append_byte(buffer, byte);
+}
+
+/**
  * The regular expression engine doesn't support the same escape sequences as
  * Ruby does. So first we have to read the escape sequence, and then we have to
  * format it like the regular expression engine expects it. For example, in Ruby
@@ -7253,7 +7284,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t *st
  * source so that the regular expression engine will perform its own unescaping.
  */
 static inline void
-escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
+escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
     if (flags & PM_ESCAPE_FLAG_REGEXP) {
         pm_buffer_append_bytes(buffer, (const uint8_t *) "\\x", 2);
 
@@ -7272,7 +7303,7 @@ escape_write_byte(pm_buffer_t *buffer, uint8_t flags, uint8_t byte) {
             pm_buffer_append_byte(buffer, (uint8_t) (byte2 + '0'));
         }
     } else {
-        pm_buffer_append_byte(buffer, byte);
+        escape_write_byte_encoded(parser, buffer, byte);
     }
 }
 
@@ -7351,7 +7382,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                 }
             }
 
-            pm_buffer_append_byte(buffer, value);
+            escape_write_byte_encoded(parser, buffer, value);
             return;
         }
         case 'x': {
@@ -7373,7 +7404,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                 if (flags & PM_ESCAPE_FLAG_REGEXP) {
                     pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end - start));
                 } else {
-                    pm_buffer_append_byte(buffer, value);
+                    escape_write_byte_encoded(parser, buffer, value);
                 }
             } else {
                 pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL);
@@ -7397,7 +7428,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                 if (flags & PM_ESCAPE_FLAG_REGEXP) {
                     pm_buffer_append_bytes(buffer, start, (size_t) (parser->current.end + 4 - start));
                 } else {
-                    escape_write_unicode(parser, buffer, start, parser->current.end + 4, value);
+                    escape_write_unicode(parser, buffer, flags, start, parser->current.end + 4, value);
                 }
 
                 parser->current.end += 4;
@@ -7431,13 +7462,14 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
 
                     if (!(flags & PM_ESCAPE_FLAG_REGEXP)) {
                         uint32_t value = escape_unicode(unicode_start, hexadecimal_length);
-                        escape_write_unicode(parser, buffer, unicode_start, parser->current.end, value);
+                        escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value);
                     }
 
                     parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
                 }
 
-                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
+                // ?\u{nnnn} character literal should contain only one codepoint
+                // and cannot be like ?\u{nnnn mmmm}.
                 if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) {
                     pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
                 }
@@ -7468,7 +7500,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             switch (peeked) {
                 case '?': {
                     parser->current.end++;
-                    escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
+                    escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
                     return;
                 }
                 case '\\':
@@ -7486,7 +7518,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     }
 
                     parser->current.end++;
-                    escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                     return;
                 }
             }
@@ -7508,7 +7540,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             switch (peeked) {
                 case '?': {
                     parser->current.end++;
-                    escape_write_byte(buffer, flags, escape_byte(0x7f, flags));
+                    escape_write_byte(parser, buffer, flags, escape_byte(0x7f, flags));
                     return;
                 }
                 case '\\':
@@ -7526,7 +7558,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
                     }
 
                     parser->current.end++;
-                    escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
+                    escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL));
                     return;
                 }
             }
@@ -7561,7 +7593,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
             }
 
             parser->current.end++;
-            escape_write_byte(buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
+            escape_write_byte(parser, buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META));
             return;
         }
         case '\r': {
@@ -7574,7 +7606,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t flags) {
         /* fallthrough */
         default: {
             if (parser->current.end < parser->end) {
-                pm_buffer_append_byte(buffer, *parser->current.end++);
+                escape_write_byte_encoded(parser, buffer, *parser->current.end++);
             }
             return;
         }
@@ -7637,13 +7669,12 @@ lex_question_mark(pm_parser_t *parser) {
 
         return PM_TOKEN_CHARACTER_LITERAL;
     } else {
-        size_t encoding_width = parser->encoding.char_width(parser->current.end, parser->end - parser->current.end);
+        size_t encoding_width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end);
 
-        // Ternary operators can have a ? immediately followed by an identifier which starts with
-        // an underscore. We check for this case
+        // Ternary operators can have a ? immediately followed by an identifier
+        // which starts with an underscore. We check for this case here.
         if (
-            !(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) ||
-              peek(parser) == '_') ||
+            !(parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end) || peek(parser) == '_') ||
             (
                 (parser->current.end + encoding_width >= parser->end) ||
                 !char_is_identifier(parser, parser->current.end + encoding_width)
@@ -8491,6 +8522,7 @@ parser_lex(pm_parser_t *parser) {
                                     // TODO: handle unterminated heredoc
                                 }
 
+                                parser->explicit_encoding = NULL;
                                 lex_mode_push(parser, (pm_lex_mode_t) {
                                     .mode = PM_LEX_HEREDOC,
                                     .as.heredoc = {
@@ -8897,7 +8929,7 @@ parser_lex(pm_parser_t *parser) {
                         (lex_state_p(parser, PM_LEX_STATE_FITEM) && (peek(parser) == 's')) ||
                         lex_state_spcarg_p(parser, space_seen)
                     ) {
-                        if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
+                        if (!parser->encoding->alnum_char(parser->current.end, parser->end - parser->current.end)) {
                             if (*parser->current.end >= 0x80) {
                                 pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
                             }
@@ -8920,7 +8952,7 @@ parser_lex(pm_parser_t *parser) {
                         // Delimiters for %-literals cannot be alphanumeric. We
                         // validate that here.
                         uint8_t delimiter = peek_offset(parser, 1);
-                        if (delimiter >= 0x80 || parser->encoding.alnum_char(&delimiter, 1)) {
+                        if (delimiter >= 0x80 || parser->encoding->alnum_char(&delimiter, 1)) {
                             pm_parser_err_current(parser, PM_ERR_INVALID_PERCENT);
                             goto lex_next_token;
                         }
@@ -9766,7 +9798,6 @@ parser_lex(pm_parser_t *parser) {
             if (current_token_starts_line(parser)) {
                 const uint8_t *start = parser->current.start;
                 if (start + ident_length <= parser->end) {
-                    bool at_end = false;
                     const uint8_t *newline = next_newline(start, parser->end - start);
                     const uint8_t *ident_end = newline;
                     const uint8_t *terminator_end = newline;
@@ -9774,7 +9805,6 @@ parser_lex(pm_parser_t *parser) {
                     if (newline == NULL) {
                         terminator_end = parser->end;
                         ident_end = parser->end;
-                        at_end = true;
                     } else {
                         terminator_end++;
                         if (newline[-1] == '\r') {
@@ -9801,6 +9831,7 @@ parser_lex(pm_parser_t *parser) {
                         if (newline != NULL) {
                             pm_newline_list_append(&parser->newline_list, newline);
                         }
+
                         parser->current.end = terminator_end;
                         if (*lex_mode->as.heredoc.next_start == '\\') {
                             parser->next_start = NULL;
@@ -9809,14 +9840,11 @@ parser_lex(pm_parser_t *parser) {
                             parser->heredoc_end = parser->current.end;
                         }
 
-                        parser->current_string_common_whitespace = parser->lex_modes.current->as.heredoc.common_whitespace;
-                        lex_mode_pop(parser);
-                        if (!at_end) {
-                            lex_state_set(parser, PM_LEX_STATE_END);
-                        }
+                        lex_state_set(parser, PM_LEX_STATE_END);
                         LEX(PM_TOKEN_HEREDOC_END);
                     }
                 }
+
                 size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
                 if (
                     lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
@@ -10588,7 +10616,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target) {
                     return target;
                 }
 
-                if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
+                if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
                     parse_write_name(parser, &call->name);
                     return (pm_node_t *) call;
                 }
@@ -10735,7 +10763,7 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod
                     return target;
                 }
 
-                if (*call->message_loc.start == '_' || parser->encoding.alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
+                if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) {
                     // When we get here, we have a method call, because it was
                     // previously marked as a method call but now we have an =. This
                     // looks like:
@@ -10970,7 +10998,7 @@ parse_assocs(pm_parser_t *parser, pm_node_t *node) {
                 if (token_begins_expression_p(parser->current.type)) {
                     value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL);
                 } else {
-                    if (parser->encoding.isupper_char(label.start, (label.end - 1) - label.start)) {
+                    if (parser->encoding->isupper_char(label.start, (label.end - 1) - label.start)) {
                         pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 };
                         value = (pm_node_t *) pm_constant_read_node_create(parser, &constant);
                     } else {
@@ -12239,6 +12267,26 @@ parse_conditional(pm_parser_t *parser, pm_context_t context) {
     case PM_INSTANCE_VARIABLE_READ_NODE: case PM_MULTI_TARGET_NODE: case PM_BACK_REFERENCE_READ_NODE: \
     case PM_NUMBERED_REFERENCE_READ_NODE
 
+// Assert here that the flags are the same so that we can safely switch the type
+// of the node without having to move the flags.
+PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int) PM_ENCODING_FLAGS_FORCED_UTF8_ENCODING), "Expected the flags to match.");
+
+/**
+ * If the encoding was explicitly set through the lexing process, then we need
+ * to potentially mark the string's flags to indicate how to encode it.
+ *
+ * The returned value is intended to be OR-ed into the flags of a node that
+ * holds unescaped content. It is one of:
+ *
+ * - PM_STRING_FLAGS_FORCED_UTF8_ENCODING when parser->explicit_encoding is
+ *   PM_ENCODING_UTF_8_ENTRY
+ * - PM_STRING_FLAGS_FORCED_BINARY_ENCODING when parser->explicit_encoding is
+ *   set to anything else and parser->encoding is PM_ENCODING_US_ASCII_ENTRY
+ * - 0 in every other case, including when parser->explicit_encoding is NULL
+ *
+ * NOTE(review): the second check reads parser->encoding rather than
+ * parser->explicit_encoding. This looks deliberate (a non-UTF-8 explicit
+ * encoding seen while the parser's encoding is US-ASCII), but confirm.
+ *
+ * @param parser The parser whose explicit_encoding and encoding are inspected.
+ * @return The encoding flag to apply to the node, or 0 if none applies.
+ */
+static inline pm_node_flags_t
+parse_unescaped_encoding(const pm_parser_t *parser) {
+    if (parser->explicit_encoding != NULL) {
+        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            return PM_STRING_FLAGS_FORCED_UTF8_ENCODING;
+        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            return PM_STRING_FLAGS_FORCED_BINARY_ENCODING;
+        }
+    }
+    return 0;
+}
+
 /**
  * Parse a node that is part of a string. If the subsequent tokens cannot be
  * parsed as a string part, then NULL is returned.
@@ -12255,7 +12303,9 @@ parse_string_part(pm_parser_t *parser) {
         case PM_TOKEN_STRING_CONTENT: {
             pm_token_t opening = not_provided(parser);
             pm_token_t closing = not_provided(parser);
+
             pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
+            node->flags |= parse_unescaped_encoding(parser);
 
             parser_lex(parser);
             return node;
@@ -13459,8 +13509,9 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
 
         // Here we have found a string literal. We'll parse it and add it to
         // the list of strings.
-        assert(parser->lex_modes.current->mode == PM_LEX_STRING);
-        bool lex_interpolation = parser->lex_modes.current->as.string.interpolation;
+        const pm_lex_mode_t *lex_mode = parser->lex_modes.current;
+        assert(lex_mode->mode == PM_LEX_STRING);
+        bool lex_interpolation = lex_mode->as.string.interpolation;
 
         pm_token_t opening = parser->current;
         parser_lex(parser);
@@ -13544,6 +13595,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
 
             if (match1(parser, PM_TOKEN_STRING_END)) {
                 node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
+                node->flags |= parse_unescaped_encoding(parser);
                 parser_lex(parser);
             } else if (accept1(parser, PM_TOKEN_LABEL_END)) {
                 node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
@@ -13555,6 +13607,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) {
                 pm_token_t string_closing = not_provided(parser);
 
                 pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped);
+                part->flags |= parse_unescaped_encoding(parser);
                 pm_node_list_append(&parts, part);
 
                 while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) {
@@ -13888,6 +13941,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
 
             pm_token_t closing = not_provided(parser);
             pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing);
+            node->flags |= parse_unescaped_encoding(parser);
 
             // Characters can be followed by strings in which case they are
             // automatically concatenated.
@@ -14074,7 +14128,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             if (match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) {
                 // If we get here, then we have an empty heredoc. We'll create
                 // an empty content token and return an empty string node.
-                lex_state_set(parser, PM_LEX_STATE_END);
+                lex_mode_pop(parser);
                 expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
                 pm_token_t content = parse_strings_empty_content(parser->previous.start);
 
@@ -14095,6 +14149,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                 // content and we're at the end of the heredoc, so we can return
                 // just a string node with the heredoc opening and closing as
                 // its opening and closing.
+                part->flags |= parse_unescaped_encoding(parser);
                 pm_string_node_t *cast = (pm_string_node_t *) part;
 
                 cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening);
@@ -14106,13 +14161,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                     cast->base.type = PM_X_STRING_NODE;
                 }
 
-                size_t common_whitespace = parser->current_string_common_whitespace;
+                size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
                 if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
                     parse_heredoc_dedent_string(&cast->unescaped, common_whitespace);
                 }
 
                 node = (pm_node_t *) cast;
-                lex_state_set(parser, PM_LEX_STATE_END);
+                lex_mode_pop(parser);
                 expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
             } else {
                 // If we get here, then we have multiple parts in the heredoc,
@@ -14127,13 +14182,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                     }
                 }
 
+                size_t common_whitespace = lex_mode->as.heredoc.common_whitespace;
+
                 // Now that we have all of the parts, create the correct type of
                 // interpolated node.
                 if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
                     pm_interpolated_x_string_node_t *cast = pm_interpolated_xstring_node_create(parser, &opening, &opening);
                     cast->parts = parts;
 
-                    lex_state_set(parser, PM_LEX_STATE_END);
+                    lex_mode_pop(parser);
                     expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
 
                     pm_interpolated_xstring_node_closing_set(cast, &parser->previous);
@@ -14142,7 +14199,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                 } else {
                     pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening);
 
-                    lex_state_set(parser, PM_LEX_STATE_END);
+                    lex_mode_pop(parser);
                     expect1(parser, PM_TOKEN_HEREDOC_END, PM_ERR_HEREDOC_TERM);
 
                     pm_interpolated_string_node_closing_set(cast, &parser->previous);
@@ -14152,7 +14209,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
 
                 // If this is a heredoc that is indented with a ~, then we need
                 // to dedent each line by the common leading whitespace.
-                size_t common_whitespace = parser->current_string_common_whitespace;
                 if (indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) {
                     pm_node_list_t *nodes;
                     if (quote == PM_HEREDOC_QUOTE_BACKTICK) {
@@ -15409,8 +15465,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             } else {
                 expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM);
             }
-            pm_array_node_close_set(array, &closing);
 
+            pm_array_node_close_set(array, &closing);
             return (pm_node_t *) array;
         }
         case PM_TOKEN_PERCENT_UPPER_W: {
@@ -15418,19 +15474,24 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             pm_token_t opening = parser->previous;
             pm_array_node_t *array = pm_array_node_create(parser, &opening);
 
-            // This is the current node that we are parsing that will be added to the
-            // list of elements.
+            // This is the current node that we are parsing that will be added
+            // to the list of elements.
             pm_node_t *current = NULL;
 
             while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) {
                 switch (parser->current.type) {
                     case PM_TOKEN_WORDS_SEP: {
+                        // Reset the explicit encoding if we hit a separator
+                        // since each element can have its own encoding.
+                        parser->explicit_encoding = NULL;
+
                         if (current == NULL) {
-                            // If we hit a separator before we have any content, then we don't
-                            // need to do anything.
+                            // If we hit a separator before we have any content,
+                            // then we don't need to do anything.
                         } else {
-                            // If we hit a separator after we've hit content, then we need to
-                            // append that content to the list and reset the current node.
+                            // If we hit a separator after we've hit content,
+                            // then we need to append that content to the list
+                            // and reset the current node.
                             pm_array_node_elements_append(array, current);
                             current = NULL;
                         }
@@ -15443,22 +15504,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                         pm_token_t closing = not_provided(parser);
 
                         pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
+                        string->flags |= parse_unescaped_encoding(parser);
                         parser_lex(parser);
 
                         if (current == NULL) {
-                            // If we hit content and the current node is NULL, then this is
-                            // the first string content we've seen. In that case we're going
-                            // to create a new string node and set that to the current.
+                            // If we hit content and the current node is NULL,
+                            // then this is the first string content we've seen.
+                            // In that case we're going to create a new string
+                            // node and set that to the current.
                             current = string;
                         } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
-                            // If we hit string content and the current node is an
-                            // interpolated string, then we need to append the string content
-                            // to the list of child nodes.
+                            // If we hit string content and the current node is
+                            // an interpolated string, then we need to append
+                            // the string content to the list of child nodes.
                             pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string);
                         } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
-                            // If we hit string content and the current node is a string node,
-                            // then we need to convert the current node into an interpolated
-                            // string and add the string content to the list of child nodes.
+                            // If we hit string content and the current node is
+                            // a string node, then we need to convert the
+                            // current node into an interpolated string and add
+                            // the string content to the list of child nodes.
                             pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
                             pm_interpolated_string_node_append(interpolated, current);
                             pm_interpolated_string_node_append(interpolated, string);
@@ -15471,24 +15535,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                     }
                     case PM_TOKEN_EMBVAR: {
                         if (current == NULL) {
-                            // If we hit an embedded variable and the current node is NULL,
-                            // then this is the start of a new string. We'll set the current
-                            // node to a new interpolated string.
+                            // If we hit an embedded variable and the current
+                            // node is NULL, then this is the start of a new
+                            // string. We'll set the current node to a new
+                            // interpolated string.
                             pm_token_t opening = not_provided(parser);
                             pm_token_t closing = not_provided(parser);
                             current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
                         } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
-                            // If we hit an embedded variable and the current node is a string
-                            // node, then we'll convert the current into an interpolated
-                            // string and add the string node to the list of parts.
+                            // If we hit an embedded variable and the current
+                            // node is a string node, then we'll convert the
+                            // current into an interpolated string and add the
+                            // string node to the list of parts.
                             pm_token_t opening = not_provided(parser);
                             pm_token_t closing = not_provided(parser);
                             pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
                             pm_interpolated_string_node_append(interpolated, current);
                             current = (pm_node_t *) interpolated;
                         } else {
-                            // If we hit an embedded variable and the current node is an
-                            // interpolated string, then we'll just add the embedded variable.
+                            // If we hit an embedded variable and the current
+                            // node is an interpolated string, then we'll just
+                            // add the embedded variable.
                         }
 
                         pm_node_t *part = parse_string_part(parser);
@@ -15497,25 +15564,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                     }
                     case PM_TOKEN_EMBEXPR_BEGIN: {
                         if (current == NULL) {
-                            // If we hit an embedded expression and the current node is NULL,
-                            // then this is the start of a new string. We'll set the current
-                            // node to a new interpolated string.
+                            // If we hit an embedded expression and the current
+                            // node is NULL, then this is the start of a new
+                            // string. We'll set the current node to a new
+                            // interpolated string.
                             pm_token_t opening = not_provided(parser);
                             pm_token_t closing = not_provided(parser);
                             current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
                         } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) {
-                            // If we hit an embedded expression and the current node is a
-                            // string node, then we'll convert the current into an
-                            // interpolated string and add the string node to the list of
-                            // parts.
+                            // If we hit an embedded expression and the current
+                            // node is a string node, then we'll convert the
+                            // current into an interpolated string and add the
+                            // string node to the list of parts.
                             pm_token_t opening = not_provided(parser);
                             pm_token_t closing = not_provided(parser);
                             pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing);
                             pm_interpolated_string_node_append(interpolated, current);
                             current = (pm_node_t *) interpolated;
                         } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) {
-                            // If we hit an embedded expression and the current node is an
-                            // interpolated string, then we'll just continue on.
+                            // If we hit an embedded expression and the current
+                            // node is an interpolated string, then we'll just
+                            // continue on.
                         } else {
                             assert(false && "unreachable");
                         }
@@ -15543,8 +15612,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
             } else {
                 expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM);
             }
-            pm_array_node_close_set(array, &closing);
 
+            pm_array_node_close_set(array, &closing);
             return (pm_node_t *) array;
         }
         case PM_TOKEN_REGEXP_BEGIN: {
@@ -15652,8 +15721,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
                 pm_token_t content = parser->current;
                 parser_lex(parser);
 
-                if (accept1(parser, PM_TOKEN_STRING_END)) {
-                    return (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
+                if (match1(parser, PM_TOKEN_STRING_END)) {
+                    pm_node_t *node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped);
+                    node->flags |= parse_unescaped_encoding(parser);
+                    parser_lex(parser);
+                    return node;
                 }
 
                 // If we get here, then we have interpolation so we'll need to
@@ -15662,7 +15734,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
 
                 pm_token_t opening = not_provided(parser);
                 pm_token_t closing = not_provided(parser);
+
                 pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
+                part->flags |= parse_unescaped_encoding(parser);
 
                 pm_interpolated_xstring_node_append(node, part);
             } else {
@@ -15986,7 +16060,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
     pm_string_list_t named_captures = { 0 };
     pm_node_t *result;
 
-    if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, &parser->encoding) && (named_captures.length > 0)) {
+    if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, parser->encoding) && (named_captures.length > 0)) {
         // Since we should not create a MatchWriteNode when all capture names
         // are invalid, creating a MatchWriteNode is delayed here.
         pm_match_write_node_t *match = NULL;
@@ -17004,7 +17078,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
         .error_list = { 0 },
         .current_scope = NULL,
         .current_context = NULL,
-        .encoding = *pm_encoding_utf_8,
+        .encoding = PM_ENCODING_UTF_8_ENTRY,
         .encoding_changed_callback = NULL,
         .encoding_comment_start = source,
         .lex_callback = NULL,
@@ -17014,6 +17088,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
         .integer_base = 0,
         .current_string = PM_STRING_EMPTY,
         .start_line = 1,
+        .explicit_encoding = NULL,
         .command_start = true,
         .recovering = false,
         .encoding_changed = false,
@@ -17240,7 +17315,7 @@ pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t s
 
     pm_node_t *node = pm_parse(&parser);
     pm_serialize_header(buffer);
-    pm_serialize_encoding(&parser.encoding, buffer);
+    pm_serialize_encoding(parser.encoding, buffer);
     pm_buffer_append_varsint(buffer, parser.start_line);
     pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
 
diff --git a/prism/prism.h b/prism/prism.h
index 590cd74016..40ba5e554d 100644
--- a/prism/prism.h
+++ b/prism/prism.h
@@ -91,7 +91,7 @@ void pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t
  * @param encoding The encoding to serialize.
  * @param buffer The buffer to serialize to.
  */
-void pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer);
+void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer);
 
 /**
  * Serialize the encoding, metadata, nodes, and constant pool.
diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb
index 92fe9188f9..e9cdd1e82c 100644
--- a/prism/templates/src/serialize.c.erb
+++ b/prism/templates/src/serialize.c.erb
@@ -206,7 +206,7 @@ pm_serialize_diagnostic_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *
  * Serialize the name of the encoding to the buffer.
  */
 void
-pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) {
+pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer) {
     size_t encoding_length = strlen(encoding->name);
     pm_buffer_append_varuint(buffer, pm_sizet_to_u32(encoding_length));
     pm_buffer_append_string(buffer, encoding->name, encoding_length);
@@ -218,7 +218,7 @@ pm_serialize_encoding(pm_encoding_t *encoding, pm_buffer_t *buffer) {
  */
 void
 pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) {
-    pm_serialize_encoding(&parser->encoding, buffer);
+    pm_serialize_encoding(parser->encoding, buffer);
     pm_buffer_append_varsint(buffer, parser->start_line);
 <%- unless Prism::SERIALIZE_ONLY_SEMANTICS_FIELDS -%>
     pm_serialize_comment_list(parser, &parser->comment_list, buffer);
@@ -317,7 +317,7 @@ pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const
     // Append 0 to mark end of tokens.
     pm_buffer_append_byte(buffer, 0);
 
-    pm_serialize_encoding(&parser.encoding, buffer);
+    pm_serialize_encoding(parser.encoding, buffer);
     pm_buffer_append_varsint(buffer, parser.start_line);
     pm_serialize_comment_list(&parser, &parser.comment_list, buffer);
     pm_serialize_magic_comment_list(&parser, &parser.magic_comment_list, buffer);
diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c
index ce1f36910b..115eba1fd2 100644
--- a/prism/util/pm_strpbrk.c
+++ b/prism/util/pm_strpbrk.c
@@ -4,7 +4,7 @@
  * This is the slow path that does care about the encoding.
  */
 static inline const uint8_t *
-pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
+pm_strpbrk_multi_byte(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum) {
     size_t index = 0;
 
     while (index < maximum) {
@@ -12,7 +12,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
             return source + index;
         }
 
-        size_t width = parser->encoding.char_width(source + index, (ptrdiff_t) (maximum - index));
+        size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
         if (width == 0) {
             return NULL;
         }
@@ -61,10 +61,10 @@ pm_strpbrk_single_byte(const uint8_t *source, const uint8_t *charset, size_t max
  * need to take a slower path and iterate one multi-byte character at a time.
  */
 const uint8_t *
-pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
+pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length) {
     if (length <= 0) {
         return NULL;
-    } else if (parser->encoding_changed && parser->encoding.multibyte) {
+    } else if (parser->encoding_changed && parser->encoding->multibyte) {
         return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length);
     } else {
         return pm_strpbrk_single_byte(source, charset, (size_t) length);
diff --git a/prism/util/pm_strpbrk.h b/prism/util/pm_strpbrk.h
index 61a443e51a..c1cf0d54db 100644
--- a/prism/util/pm_strpbrk.h
+++ b/prism/util/pm_strpbrk.h
@@ -32,12 +32,12 @@
  * need to take a slower path and iterate one multi-byte character at a time.
  *
  * @param parser The parser.
- * @param source The source string.
+ * @param source The source to search.
  * @param charset The charset to search for.
- * @param length The maximum length to search.
+ * @param length The maximum number of bytes to search.
  * @return A pointer to the first character in the source string that is in the
  *     charset, or NULL if no such character exists.
  */
-const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
+const uint8_t * pm_strpbrk(const pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length);
 
 #endif
diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb
index 94ba3a6c2a..e4678c6f82 100644
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@@ -7,90 +7,16 @@ require_relative "test_helper"
 module Prism
   class EncodingTest < TestCase
     codepoints_1byte = 0...0x100
-    codepoints_2bytes = 0...0x10000
-
     encodings = {
-      Encoding::ASCII =>         codepoints_1byte,
-      Encoding::ASCII_8BIT =>    codepoints_1byte,
-      Encoding::CP850 =>         codepoints_1byte,
-      Encoding::CP852 =>         codepoints_1byte,
-      Encoding::CP855 =>         codepoints_1byte,
-      Encoding::GB1988 =>        codepoints_1byte,
-      Encoding::IBM437 =>        codepoints_1byte,
-      Encoding::IBM720 =>        codepoints_1byte,
-      Encoding::IBM737 =>        codepoints_1byte,
-      Encoding::IBM775 =>        codepoints_1byte,
-      Encoding::IBM852 =>        codepoints_1byte,
-      Encoding::IBM855 =>        codepoints_1byte,
-      Encoding::IBM857 =>        codepoints_1byte,
-      Encoding::IBM860 =>        codepoints_1byte,
-      Encoding::IBM861 =>        codepoints_1byte,
-      Encoding::IBM862 =>        codepoints_1byte,
-      Encoding::IBM863 =>        codepoints_1byte,
-      Encoding::IBM864 =>        codepoints_1byte,
-      Encoding::IBM865 =>        codepoints_1byte,
-      Encoding::IBM866 =>        codepoints_1byte,
-      Encoding::IBM869 =>        codepoints_1byte,
-      Encoding::ISO_8859_1 =>    codepoints_1byte,
-      Encoding::ISO_8859_2 =>    codepoints_1byte,
-      Encoding::ISO_8859_3 =>    codepoints_1byte,
-      Encoding::ISO_8859_4 =>    codepoints_1byte,
-      Encoding::ISO_8859_5 =>    codepoints_1byte,
-      Encoding::ISO_8859_6 =>    codepoints_1byte,
-      Encoding::ISO_8859_7 =>    codepoints_1byte,
-      Encoding::ISO_8859_8 =>    codepoints_1byte,
-      Encoding::ISO_8859_9 =>    codepoints_1byte,
-      Encoding::ISO_8859_10 =>   codepoints_1byte,
-      Encoding::ISO_8859_11 =>   codepoints_1byte,
-      Encoding::ISO_8859_13 =>   codepoints_1byte,
-      Encoding::ISO_8859_14 =>   codepoints_1byte,
-      Encoding::ISO_8859_15 =>   codepoints_1byte,
-      Encoding::ISO_8859_16 =>   codepoints_1byte,
-      Encoding::KOI8_R =>        codepoints_1byte,
-      Encoding::KOI8_U =>        codepoints_1byte,
-      Encoding::MACCENTEURO =>   codepoints_1byte,
-      Encoding::MACCROATIAN =>   codepoints_1byte,
-      Encoding::MACCYRILLIC =>   codepoints_1byte,
-      Encoding::MACGREEK =>      codepoints_1byte,
-      Encoding::MACICELAND =>    codepoints_1byte,
-      Encoding::MACROMAN =>      codepoints_1byte,
-      Encoding::MACROMANIA =>    codepoints_1byte,
-      Encoding::MACTHAI =>       codepoints_1byte,
-      Encoding::MACTURKISH =>    codepoints_1byte,
-      Encoding::MACUKRAINE =>    codepoints_1byte,
-      Encoding::TIS_620 =>       codepoints_1byte,
-      Encoding::Windows_1250 =>  codepoints_1byte,
-      Encoding::Windows_1251 =>  codepoints_1byte,
-      Encoding::Windows_1252 =>  codepoints_1byte,
-      Encoding::Windows_1253 =>  codepoints_1byte,
-      Encoding::Windows_1254 =>  codepoints_1byte,
-      Encoding::Windows_1255 =>  codepoints_1byte,
-      Encoding::Windows_1256 =>  codepoints_1byte,
-      Encoding::Windows_1257 =>  codepoints_1byte,
-      Encoding::Windows_1258 =>  codepoints_1byte,
-      Encoding::Windows_874 =>   codepoints_1byte,
-      Encoding::Big5 =>          codepoints_2bytes,
-      Encoding::Big5_HKSCS =>    codepoints_2bytes,
-      Encoding::Big5_UAO =>      codepoints_2bytes,
-      Encoding::CP949 =>         codepoints_2bytes,
-      Encoding::CP950 =>         codepoints_2bytes,
-      Encoding::CP951 =>         codepoints_2bytes,
-      Encoding::EUC_KR =>        codepoints_2bytes,
-      Encoding::GBK =>           codepoints_2bytes,
-      Encoding::GB12345 =>       codepoints_2bytes,
-      Encoding::GB2312 =>        codepoints_2bytes,
-      Encoding::MACJAPANESE =>   codepoints_2bytes,
-      Encoding::Shift_JIS =>     codepoints_2bytes,
-      Encoding::SJIS_DoCoMo =>   codepoints_2bytes,
-      Encoding::SJIS_KDDI =>     codepoints_2bytes,
-      Encoding::SJIS_SoftBank => codepoints_2bytes,
-      Encoding::Windows_31J =>   codepoints_2bytes
+      Encoding::ASCII_8BIT =>   codepoints_1byte,
+      Encoding::US_ASCII =>     codepoints_1byte,
+      Encoding::Windows_1253 => codepoints_1byte
     }
 
-    # By default we don't test every codepoint in these encodings because they
-    # are 3 and 4 byte representations so it can drastically slow down the test
-    # suite.
+    # By default we don't test every codepoint in these encodings because it
+    # takes a very long time.
     if ENV["PRISM_TEST_ALL_ENCODINGS"]
+      codepoints_2bytes = 0...0x10000
       codepoints_unicode = (0...0x110000)
 
       codepoints_eucjp = [
@@ -118,6 +44,78 @@ module Prism
       ]
 
       encodings.merge!(
+        Encoding::CP850 =>                      codepoints_1byte,
+        Encoding::CP852 =>                      codepoints_1byte,
+        Encoding::CP855 =>                      codepoints_1byte,
+        Encoding::GB1988 =>                     codepoints_1byte,
+        Encoding::IBM437 =>                     codepoints_1byte,
+        Encoding::IBM720 =>                     codepoints_1byte,
+        Encoding::IBM737 =>                     codepoints_1byte,
+        Encoding::IBM775 =>                     codepoints_1byte,
+        Encoding::IBM852 =>                     codepoints_1byte,
+        Encoding::IBM855 =>                     codepoints_1byte,
+        Encoding::IBM857 =>                     codepoints_1byte,
+        Encoding::IBM860 =>                     codepoints_1byte,
+        Encoding::IBM861 =>                     codepoints_1byte,
+        Encoding::IBM862 =>                     codepoints_1byte,
+        Encoding::IBM863 =>                     codepoints_1byte,
+        Encoding::IBM864 =>                     codepoints_1byte,
+        Encoding::IBM865 =>                     codepoints_1byte,
+        Encoding::IBM866 =>                     codepoints_1byte,
+        Encoding::IBM869 =>                     codepoints_1byte,
+        Encoding::ISO_8859_1 =>                 codepoints_1byte,
+        Encoding::ISO_8859_2 =>                 codepoints_1byte,
+        Encoding::ISO_8859_3 =>                 codepoints_1byte,
+        Encoding::ISO_8859_4 =>                 codepoints_1byte,
+        Encoding::ISO_8859_5 =>                 codepoints_1byte,
+        Encoding::ISO_8859_6 =>                 codepoints_1byte,
+        Encoding::ISO_8859_7 =>                 codepoints_1byte,
+        Encoding::ISO_8859_8 =>                 codepoints_1byte,
+        Encoding::ISO_8859_9 =>                 codepoints_1byte,
+        Encoding::ISO_8859_10 =>                codepoints_1byte,
+        Encoding::ISO_8859_11 =>                codepoints_1byte,
+        Encoding::ISO_8859_13 =>                codepoints_1byte,
+        Encoding::ISO_8859_14 =>                codepoints_1byte,
+        Encoding::ISO_8859_15 =>                codepoints_1byte,
+        Encoding::ISO_8859_16 =>                codepoints_1byte,
+        Encoding::KOI8_R =>                     codepoints_1byte,
+        Encoding::KOI8_U =>                     codepoints_1byte,
+        Encoding::MACCENTEURO =>                codepoints_1byte,
+        Encoding::MACCROATIAN =>                codepoints_1byte,
+        Encoding::MACCYRILLIC =>                codepoints_1byte,
+        Encoding::MACGREEK =>                   codepoints_1byte,
+        Encoding::MACICELAND =>                 codepoints_1byte,
+        Encoding::MACROMAN =>                   codepoints_1byte,
+        Encoding::MACROMANIA =>                 codepoints_1byte,
+        Encoding::MACTHAI =>                    codepoints_1byte,
+        Encoding::MACTURKISH =>                 codepoints_1byte,
+        Encoding::MACUKRAINE =>                 codepoints_1byte,
+        Encoding::TIS_620 =>                    codepoints_1byte,
+        Encoding::Windows_1250 =>               codepoints_1byte,
+        Encoding::Windows_1251 =>               codepoints_1byte,
+        Encoding::Windows_1252 =>               codepoints_1byte,
+        Encoding::Windows_1254 =>               codepoints_1byte,
+        Encoding::Windows_1255 =>               codepoints_1byte,
+        Encoding::Windows_1256 =>               codepoints_1byte,
+        Encoding::Windows_1257 =>               codepoints_1byte,
+        Encoding::Windows_1258 =>               codepoints_1byte,
+        Encoding::Windows_874 =>                codepoints_1byte,
+        Encoding::Big5 =>                       codepoints_2bytes,
+        Encoding::Big5_HKSCS =>                 codepoints_2bytes,
+        Encoding::Big5_UAO =>                   codepoints_2bytes,
+        Encoding::CP949 =>                      codepoints_2bytes,
+        Encoding::CP950 =>                      codepoints_2bytes,
+        Encoding::CP951 =>                      codepoints_2bytes,
+        Encoding::EUC_KR =>                     codepoints_2bytes,
+        Encoding::GBK =>                        codepoints_2bytes,
+        Encoding::GB12345 =>                    codepoints_2bytes,
+        Encoding::GB2312 =>                     codepoints_2bytes,
+        Encoding::MACJAPANESE =>                codepoints_2bytes,
+        Encoding::Shift_JIS =>                  codepoints_2bytes,
+        Encoding::SJIS_DoCoMo =>                codepoints_2bytes,
+        Encoding::SJIS_KDDI =>                  codepoints_2bytes,
+        Encoding::SJIS_SoftBank =>              codepoints_2bytes,
+        Encoding::Windows_31J =>                codepoints_2bytes,
         Encoding::UTF_8 =>                      codepoints_unicode,
         Encoding::UTF8_MAC =>                   codepoints_unicode,
         Encoding::UTF8_DoCoMo =>                codepoints_unicode,
@@ -136,6 +134,8 @@ module Prism
       )
     end
 
+    # These test that we're correctly parsing codepoints for each alias of each
+    # encoding that prism supports.
     encodings.each do |encoding, range|
       encoding.names.each do |name|
         next if name == "locale"
@@ -146,6 +146,17 @@ module Prism
       end
     end
 
+    # These test that we're correctly setting the flags on strings for each
+    # encoding that prism supports.
+    escapes = ["\\x00", "\\x7F", "\\x80", "\\xFF", "\\u{00}", "\\u{7F}", "\\u{80}", "\\M-\\C-?"]
+    escapes = escapes.concat(escapes.product(escapes).map(&:join))
+
+    encodings.each_key do |encoding|
+      define_method(:"test_encoding_flags_#{encoding.name}") do
+        assert_encoding_flags(encoding, escapes)
+      end
+    end
+
     def test_coding
       result = Prism.parse("# coding: utf-8\n'string'")
       actual = result.value.statements.body.first.unescaped.encoding
@@ -292,5 +303,47 @@ module Prism
         refute Prism.parse(source).success?
       end
     end
+
+    def assert_encoding_flags(encoding, escapes)
+      escapes.each do |escaped|
+        source = "# encoding: #{encoding.name}\n\"#{escaped}\""
+
+        expected =
+          begin
+            eval(source).encoding
+          rescue SyntaxError => error
+            if error.message.include?("UTF-8 mixed within")
+              error.message[/: (.+?)\n/, 1]
+            else
+              raise
+            end
+          end
+
+        actual =
+          Prism.parse(source).then do |result|
+            if result.success?
+              string = result.value.statements.body.first
+
+              if string.forced_utf8_encoding?
+                Encoding::UTF_8
+              elsif string.forced_binary_encoding?
+                Encoding::ASCII_8BIT
+              else
+                encoding
+              end
+            else
+              error = result.errors.first
+
+              if error.message.include?("mixed")
+                error.message
+              else
+                raise error.message
+              end
+            end
+          end
+
+        assert_equal expected, actual
+      end
+    end
   end
 end
diff --git a/test/prism/errors_test.rb b/test/prism/errors_test.rb
index 54b710b146..58bb2e3218 100644
--- a/test/prism/errors_test.rb
+++ b/test/prism/errors_test.rb
@@ -659,7 +659,7 @@ module Prism
     end
 
     def test_do_not_allow_multiple_codepoints_in_a_single_character_literal
-      expected = StringNode(0, Location(), Location(), nil, "\u0001\u0002")
+      expected = StringNode(StringFlags::FORCED_UTF8_ENCODING, Location(), Location(), nil, "\u0001\u0002")
 
       assert_errors expected, '?\u{0001 0002}', [
         ["invalid Unicode escape sequence; multiple codepoints are not allowed in a character literal", 9..12]
diff --git a/test/prism/snapshots/arrays.txt b/test/prism/snapshots/arrays.txt
index a9adea9627..c3d4ba1e6c 100644
--- a/test/prism/snapshots/arrays.txt
+++ b/test/prism/snapshots/arrays.txt
@@ -796,6 +796,7 @@
         │   ├── closing_loc: (64,16)-(64,17) = "#"
         │   └── flags: ∅
         ├── @ XStringNode (location: (66,0)-(66,17))
+        │   ├── flags: ∅
         │   ├── opening_loc: (66,0)-(66,3) = "%x#"
         │   ├── content_loc: (66,3)-(66,16) = "one two three"
         │   ├── closing_loc: (66,16)-(66,17) = "#"
@@ -844,6 +845,7 @@
         │   ├── closing_loc: (71,16)-(71,17) = "@"
         │   └── flags: ∅
         ├── @ XStringNode (location: (73,0)-(73,17))
+        │   ├── flags: ∅
         │   ├── opening_loc: (73,0)-(73,3) = "%x@"
         │   ├── content_loc: (73,3)-(73,16) = "one two three"
         │   ├── closing_loc: (73,16)-(73,17) = "@"
@@ -892,6 +894,7 @@
         │   ├── closing_loc: (78,16)-(78,17) = "}"
         │   └── flags: ∅
         ├── @ XStringNode (location: (80,0)-(80,17))
+        │   ├── flags: ∅
         │   ├── opening_loc: (80,0)-(80,3) = "%x{"
         │   ├── content_loc: (80,3)-(80,16) = "one two three"
         │   ├── closing_loc: (80,16)-(80,17) = "}"
diff --git a/test/prism/snapshots/patterns.txt b/test/prism/snapshots/patterns.txt
index ee18448206..05f558d7cb 100644
--- a/test/prism/snapshots/patterns.txt
+++ b/test/prism/snapshots/patterns.txt
@@ -178,6 +178,7 @@
         │   │   └── flags: variable_call
         │   ├── pattern:
         │   │   @ XStringNode (location: (10,7)-(10,12))
+        │   │   ├── flags: ∅
         │   │   ├── opening_loc: (10,7)-(10,8) = "`"
         │   │   ├── content_loc: (10,8)-(10,11) = "foo"
         │   │   ├── closing_loc: (10,11)-(10,12) = "`"
@@ -197,6 +198,7 @@
         │   │   └── flags: variable_call
         │   ├── pattern:
         │   │   @ XStringNode (location: (11,7)-(11,14))
+        │   │   ├── flags: ∅
         │   │   ├── opening_loc: (11,7)-(11,10) = "%x["
         │   │   ├── content_loc: (11,10)-(11,13) = "foo"
         │   │   ├── closing_loc: (11,13)-(11,14) = "]"
@@ -725,12 +727,14 @@
         │   │   @ RangeNode (location: (36,7)-(36,21))
         │   │   ├── left:
         │   │   │   @ XStringNode (location: (36,7)-(36,12))
+        │   │   │   ├── flags: ∅
         │   │   │   ├── opening_loc: (36,7)-(36,8) = "`"
         │   │   │   ├── content_loc: (36,8)-(36,11) = "foo"
         │   │   │   ├── closing_loc: (36,11)-(36,12) = "`"
         │   │   │   └── unescaped: "foo"
         │   │   ├── right:
         │   │   │   @ XStringNode (location: (36,16)-(36,21))
+        │   │   │   ├── flags: ∅
         │   │   │   ├── opening_loc: (36,16)-(36,17) = "`"
         │   │   │   ├── content_loc: (36,17)-(36,20) = "foo"
         │   │   │   ├── closing_loc: (36,20)-(36,21) = "`"
@@ -754,12 +758,14 @@
         │   │   @ RangeNode (location: (37,7)-(37,25))
         │   │   ├── left:
         │   │   │   @ XStringNode (location: (37,7)-(37,14))
+        │   │   │   ├── flags: ∅
         │   │   │   ├── opening_loc: (37,7)-(37,10) = "%x["
         │   │   │   ├── content_loc: (37,10)-(37,13) = "foo"
         │   │   │   ├── closing_loc: (37,13)-(37,14) = "]"
         │   │   │   └── unescaped: "foo"
         │   │   ├── right:
         │   │   │   @ XStringNode (location: (37,18)-(37,25))
+        │   │   │   ├── flags: ∅
         │   │   │   ├── opening_loc: (37,18)-(37,21) = "%x["
         │   │   │   ├── content_loc: (37,21)-(37,24) = "foo"
         │   │   │   ├── closing_loc: (37,24)-(37,25) = "]"
@@ -2483,6 +2489,7 @@
         │   │   └── flags: variable_call
         │   ├── pattern:
         │   │   @ XStringNode (location: (109,7)-(109,12))
+        │   │   ├── flags: ∅
         │   │   ├── opening_loc: (109,7)-(109,8) = "`"
         │   │   ├── content_loc: (109,8)-(109,11) = "foo"
         │   │   ├── closing_loc: (109,11)-(109,12) = "`"
@@ -2502,6 +2509,7 @@
         │   │   └── flags: variable_call
         │   ├── pattern:
         │   │   @ XStringNode (location: (110,7)-(110,14))
+        │   │   ├── flags: ∅
         │   │   ├── opening_loc: (110,7)-(110,10) = "%x["
         │   │   ├── content_loc: (110,10)-(110,13) = "foo"
         │   │   ├── closing_loc: (110,13)-(110,14) = "]"
@@ -3038,6 +3046,7 @@
         │   │   └── @ InNode (location: (136,10)-(136,23))
         │   │       ├── pattern:
         │   │       │   @ XStringNode (location: (136,13)-(136,18))
+        │   │       │   ├── flags: ∅
         │   │       │   ├── opening_loc: (136,13)-(136,14) = "`"
         │   │       │   ├── content_loc: (136,14)-(136,17) = "foo"
         │   │       │   ├── closing_loc: (136,17)-(136,18) = "`"
@@ -3064,6 +3073,7 @@
         │   │   └── @ InNode (location: (137,10)-(137,25))
         │   │       ├── pattern:
         │   │       │   @ XStringNode (location: (137,13)-(137,20))
+        │   │       │   ├── flags: ∅
         │   │       │   ├── opening_loc: (137,13)-(137,16) = "%x["
         │   │       │   ├── content_loc: (137,16)-(137,19) = "foo"
         │   │       │   ├── closing_loc: (137,19)-(137,20) = "]"
@@ -3828,6 +3838,7 @@
         │   │       │   │   @ StatementsNode (location: (163,13)-(163,18))
         │   │       │   │   └── body: (length: 1)
         │   │       │   │       └── @ XStringNode (location: (163,13)-(163,18))
+        │   │       │   │           ├── flags: ∅
         │   │       │   │           ├── opening_loc: (163,13)-(163,14) = "`"
         │   │       │   │           ├── content_loc: (163,14)-(163,17) = "foo"
         │   │       │   │           ├── closing_loc: (163,17)-(163,18) = "`"
@@ -3866,6 +3877,7 @@
         │   │       │   │   @ StatementsNode (location: (164,13)-(164,20))
         │   │       │   │   └── body: (length: 1)
         │   │       │   │       └── @ XStringNode (location: (164,13)-(164,20))
+        │   │       │   │           ├── flags: ∅
         │   │       │   │           ├── opening_loc: (164,13)-(164,16) = "%x["
         │   │       │   │           ├── content_loc: (164,16)-(164,19) = "foo"
         │   │       │   │           ├── closing_loc: (164,19)-(164,20) = "]"
diff --git a/test/prism/snapshots/seattlerb/case_in.txt b/test/prism/snapshots/seattlerb/case_in.txt
index 9134e2cb52..e66b4597b2 100644
--- a/test/prism/snapshots/seattlerb/case_in.txt
+++ b/test/prism/snapshots/seattlerb/case_in.txt
@@ -806,6 +806,7 @@
         │   │   └── @ InNode (location: (98,0)-(98,12))
         │   │       ├── pattern:
         │   │       │   @ XStringNode (location: (98,3)-(98,12))
+        │   │       │   ├── flags: ∅
         │   │       │   ├── opening_loc: (98,3)-(98,4) = "`"
         │   │       │   ├── content_loc: (98,4)-(98,11) = "echo hi"
         │   │       │   ├── closing_loc: (98,11)-(98,12) = "`"
diff --git a/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt b/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt
index e97c1fd7f4..2b1d776404 100644
--- a/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_bad_hex_escape.txt
@@ -9,7 +9,7 @@
             ├── name_loc: (1,0)-(1,1) = "s"
             ├── value:
             │   @ StringNode (location: (1,4)-(1,9))
-            │   ├── flags: ∅
+            │   ├── flags: forced_utf8_encoding
             │   ├── opening_loc: (1,4)-(1,9) = "<<eos"
             │   ├── content_loc: (2,0)-(3,0) = "a\\xE9b\n"
             │   ├── closing_loc: (3,0)-(4,0) = "eos\n"
diff --git a/test/prism/snapshots/seattlerb/heredoc_bad_oct_escape.txt b/test/prism/snapshots/seattlerb/heredoc_bad_oct_escape.txt
index 223a32e298..7a01f8d6d1 100644
--- a/test/prism/snapshots/seattlerb/heredoc_bad_oct_escape.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_bad_oct_escape.txt
@@ -9,7 +9,7 @@
             ├── name_loc: (1,0)-(1,1) = "s"
             ├── value:
             │   @ StringNode (location: (1,4)-(1,10))
-            │   ├── flags: ∅
+            │   ├── flags: forced_utf8_encoding
             │   ├── opening_loc: (1,4)-(1,10) = "<<-EOS"
             │   ├── content_loc: (2,0)-(4,0) = "a\\247b\ncöd\n"
             │   ├── closing_loc: (4,0)-(5,0) = "EOS\n"
diff --git a/test/prism/snapshots/seattlerb/read_escape_unicode_curlies.txt b/test/prism/snapshots/seattlerb/read_escape_unicode_curlies.txt
index 4c4b6b9528..3ea5604b69 100644
--- a/test/prism/snapshots/seattlerb/read_escape_unicode_curlies.txt
+++ b/test/prism/snapshots/seattlerb/read_escape_unicode_curlies.txt
@@ -4,7 +4,7 @@
     @ StatementsNode (location: (1,0)-(1,9))
     └── body: (length: 1)
         └── @ StringNode (location: (1,0)-(1,9))
-            ├── flags: ∅
+            ├── flags: forced_utf8_encoding
             ├── opening_loc: (1,0)-(1,1) = "?"
             ├── content_loc: (1,1)-(1,9) = "\\u{00a0}"
             ├── closing_loc: ∅
diff --git a/test/prism/snapshots/seattlerb/read_escape_unicode_h4.txt b/test/prism/snapshots/seattlerb/read_escape_unicode_h4.txt
index 4bdf2cd2f9..1eba1396fd 100644
--- a/test/prism/snapshots/seattlerb/read_escape_unicode_h4.txt
+++ b/test/prism/snapshots/seattlerb/read_escape_unicode_h4.txt
@@ -4,7 +4,7 @@
     @ StatementsNode (location: (1,0)-(1,7))
     └── body: (length: 1)
         └── @ StringNode (location: (1,0)-(1,7))
-            ├── flags: ∅
+            ├── flags: forced_utf8_encoding
             ├── opening_loc: (1,0)-(1,1) = "?"
             ├── content_loc: (1,1)-(1,7) = "\\u00a0"
             ├── closing_loc: ∅
diff --git a/test/prism/snapshots/seattlerb/str_evstr_escape.txt b/test/prism/snapshots/seattlerb/str_evstr_escape.txt
index 49891a9e96..a71ab57cf5 100644
--- a/test/prism/snapshots/seattlerb/str_evstr_escape.txt
+++ b/test/prism/snapshots/seattlerb/str_evstr_escape.txt
@@ -29,7 +29,7 @@
             │   │   │           └── flags: variable_call
             │   │   └── closing_loc: (1,6)-(1,7) = "}"
             │   └── @ StringNode (location: (1,7)-(1,15))
-            │       ├── flags: ∅
+            │       ├── flags: forced_utf8_encoding
             │       ├── opening_loc: ∅
             │       ├── content_loc: (1,7)-(1,15) = "\\302\\275"
             │       ├── closing_loc: ∅
diff --git a/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt b/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt
index f1226f34e2..b841407cd8 100644
--- a/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt
+++ b/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt
@@ -7,13 +7,13 @@
             ├── opening_loc: ∅
             ├── parts: (length: 2)
             │   ├── @ StringNode (location: (1,0)-(1,62))
-            │   │   ├── flags: ∅
+            │   │   ├── flags: forced_utf8_encoding
             │   │   ├── opening_loc: (1,0)-(1,1) = "\""
             │   │   ├── content_loc: (1,1)-(1,61) = "\\xE3\\xD3\\x8B\\xE3\\x83\\xBC\\x83\\xE3\\x83\\xE3\\x82\\xB3\\xA3\\x82\\x99"
             │   │   ├── closing_loc: (1,61)-(1,62) = "\""
             │   │   └── unescaped: "\xE3Ӌー\x83\xE3\x83コ\xA3\x82\x99"
             │   └── @ StringNode (location: (2,8)-(2,66))
-            │       ├── flags: ∅
+            │       ├── flags: forced_utf8_encoding
             │       ├── opening_loc: (2,8)-(2,9) = "\""
             │       ├── content_loc: (2,9)-(2,65) = "\\xE3\\x83\\xB3\\xE3\\x83\\x8F\\xE3\\x82\\x9A\\xC3\\xBD;foo@bar.com"
             │       ├── closing_loc: (2,65)-(2,66) = "\""
diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt
index e019b65229..eb7e734787 100644
--- a/test/prism/snapshots/unparser/corpus/literal/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt
@@ -472,6 +472,7 @@
         │   ├── closing_loc: (37,9)-(37,10) = "\""
         │   └── unescaped: "foo\nbar"
         ├── @ XStringNode (location: (38,0)-(38,5))
+        │   ├── flags: ∅
         │   ├── opening_loc: (38,0)-(38,1) = "`"
         │   ├── content_loc: (38,1)-(38,4) = "foo"
         │   ├── closing_loc: (38,4)-(38,5) = "`"
@@ -495,16 +496,19 @@
         │   │       └── closing_loc: (39,10)-(39,11) = "}"
         │   └── closing_loc: (39,11)-(39,12) = "`"
         ├── @ XStringNode (location: (40,0)-(40,3))
+        │   ├── flags: ∅
         │   ├── opening_loc: (40,0)-(40,1) = "`"
         │   ├── content_loc: (40,1)-(40,2) = ")"
         │   ├── closing_loc: (40,2)-(40,3) = "`"
         │   └── unescaped: ")"
         ├── @ XStringNode (location: (41,0)-(41,4))
+        │   ├── flags: ∅
         │   ├── opening_loc: (41,0)-(41,1) = "`"
         │   ├── content_loc: (41,1)-(41,3) = "\\`"
         │   ├── closing_loc: (41,3)-(41,4) = "`"
         │   └── unescaped: "`"
         ├── @ XStringNode (location: (42,0)-(42,3))
+        │   ├── flags: ∅
         │   ├── opening_loc: (42,0)-(42,1) = "`"
         │   ├── content_loc: (42,1)-(42,2) = "\""
         │   ├── closing_loc: (42,2)-(42,3) = "`"
diff --git a/test/prism/snapshots/whitequark/bug_ascii_8bit_in_literal.txt b/test/prism/snapshots/whitequark/bug_ascii_8bit_in_literal.txt
index 65d733166a..7aa8694f66 100644
--- a/test/prism/snapshots/whitequark/bug_ascii_8bit_in_literal.txt
+++ b/test/prism/snapshots/whitequark/bug_ascii_8bit_in_literal.txt
@@ -4,7 +4,7 @@
     @ StatementsNode (location: (2,9)-(2,75))
     └── body: (length: 1)
         └── @ StringNode (location: (2,9)-(2,75))
-            ├── flags: ∅
+            ├── flags: forced_utf8_encoding
             ├── opening_loc: (2,9)-(2,10) = "\""
             ├── content_loc: (2,10)-(2,74) = "\\xD0\\xBF\\xD1\\x80\\xD0\\xBE\\xD0\\xB2\\xD0\\xB5\\xD1\\x80\\xD0\\xBA\\xD0\\xB0"
             ├── closing_loc: (2,74)-(2,75) = "\""
diff --git a/test/prism/snapshots/whitequark/heredoc.txt b/test/prism/snapshots/whitequark/heredoc.txt
index 0d718c6945..86543097ee 100644
--- a/test/prism/snapshots/whitequark/heredoc.txt
+++ b/test/prism/snapshots/whitequark/heredoc.txt
@@ -16,6 +16,7 @@
         │   ├── closing_loc: (9,0)-(10,0) = "HERE\n"
         │   └── unescaped: "foo\nbar\n"
         └── @ XStringNode (location: (11,0)-(11,8))
+            ├── flags: ∅
             ├── opening_loc: (11,0)-(11,8) = "<<`HERE`"
             ├── content_loc: (12,0)-(14,0) = "foo\nbar\n"
             ├── closing_loc: (14,0)-(15,0) = "HERE\n"
diff --git a/test/prism/snapshots/whitequark/interp_digit_var.txt b/test/prism/snapshots/whitequark/interp_digit_var.txt
index 5796357fd3..d493027e44 100644
--- a/test/prism/snapshots/whitequark/interp_digit_var.txt
+++ b/test/prism/snapshots/whitequark/interp_digit_var.txt
@@ -146,11 +146,13 @@
         │   ├── closing_loc: (35,10)-(35,11) = "]"
         │   └── flags: ∅
         ├── @ XStringNode (location: (37,1)-(37,8))
+        │   ├── flags: ∅
         │   ├── opening_loc: (37,1)-(37,4) = "%x{"
         │   ├── content_loc: (37,4)-(37,7) = "\#@1"
         │   ├── closing_loc: (37,7)-(37,8) = "}"
         │   └── unescaped: "\#@1"
         ├── @ XStringNode (location: (39,1)-(39,9))
+        │   ├── flags: ∅
         │   ├── opening_loc: (39,1)-(39,4) = "%x{"
         │   ├── content_loc: (39,4)-(39,8) = "\#@@1"
         │   ├── closing_loc: (39,8)-(39,9) = "}"
@@ -212,11 +214,13 @@
         │   ├── closing_loc: (59,7)-(59,8) = "'"
         │   └── unescaped: "\#@@1"
         ├── @ XStringNode (location: (61,1)-(61,6))
+        │   ├── flags: ∅
         │   ├── opening_loc: (61,1)-(61,2) = "`"
         │   ├── content_loc: (61,2)-(61,5) = "\#@1"
         │   ├── closing_loc: (61,5)-(61,6) = "`"
         │   └── unescaped: "\#@1"
         ├── @ XStringNode (location: (63,1)-(63,7))
+        │   ├── flags: ∅
         │   ├── opening_loc: (63,1)-(63,2) = "`"
         │   ├── content_loc: (63,2)-(63,6) = "\#@@1"
         │   ├── closing_loc: (63,6)-(63,7) = "`"
@@ -246,11 +250,13 @@
         │   ├── closing_loc: (79,0)-(80,0) = "HERE\n"
         │   └── unescaped: "\#@@1\n"
         ├── @ XStringNode (location: (81,0)-(81,9))
+        │   ├── flags: ∅
         │   ├── opening_loc: (81,0)-(81,9) = "<<-`HERE`"
         │   ├── content_loc: (82,0)-(83,0) = "\#@1\n"
         │   ├── closing_loc: (83,0)-(84,0) = "HERE\n"
         │   └── unescaped: "\#@1\n"
         └── @ XStringNode (location: (85,0)-(85,9))
+            ├── flags: ∅
             ├── opening_loc: (85,0)-(85,9) = "<<-`HERE`"
             ├── content_loc: (86,0)-(87,0) = "\#@@1\n"
             ├── closing_loc: (87,0)-(88,0) = "HERE\n"
diff --git a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
index e86098e7ba..5b4b9863db 100644
--- a/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
+++ b/test/prism/snapshots/whitequark/parser_slash_slash_n_escaping_in_literals.txt
@@ -75,6 +75,7 @@
         │   ├── closing_loc: (26,1)-(26,2) = "}"
         │   └── flags: ∅
         ├── @ XStringNode (location: (28,0)-(29,2))
+        │   ├── flags: ∅
         │   ├── opening_loc: (28,0)-(28,3) = "%x{"
         │   ├── content_loc: (28,3)-(29,1) = "a\\\nb"
         │   ├── closing_loc: (29,1)-(29,2) = "}"
@@ -120,11 +121,13 @@
         │   ├── closing_loc: (54,0)-(55,0) = "HERE\n"
         │   └── unescaped: "a\\\nb\n"
         ├── @ XStringNode (location: (56,0)-(56,9))
+        │   ├── flags: ∅
         │   ├── opening_loc: (56,0)-(56,9) = "<<-`HERE`"
         │   ├── content_loc: (57,0)-(59,0) = "a\\\nb\n"
         │   ├── closing_loc: (59,0)-(60,0) = "HERE\n"
         │   └── unescaped: "ab\n"
         └── @ XStringNode (location: (61,0)-(62,2))
+            ├── flags: ∅
             ├── opening_loc: (61,0)-(61,1) = "`"
             ├── content_loc: (61,1)-(62,1) = "a\\\nb"
             ├── closing_loc: (62,1)-(62,2) = "`"
diff --git a/test/prism/snapshots/whitequark/xstring_plain.txt b/test/prism/snapshots/whitequark/xstring_plain.txt
index 2546f9829f..97084286d9 100644
--- a/test/prism/snapshots/whitequark/xstring_plain.txt
+++ b/test/prism/snapshots/whitequark/xstring_plain.txt
@@ -4,6 +4,7 @@
     @ StatementsNode (location: (1,0)-(1,8))
     └── body: (length: 1)
         └── @ XStringNode (location: (1,0)-(1,8))
+            ├── flags: ∅
             ├── opening_loc: (1,0)-(1,1) = "`"
             ├── content_loc: (1,1)-(1,7) = "foobar"
             ├── closing_loc: (1,7)-(1,8) = "`"
diff --git a/test/prism/snapshots/xstring.txt b/test/prism/snapshots/xstring.txt
index 6cfa9a350e..c8dba2a652 100644
--- a/test/prism/snapshots/xstring.txt
+++ b/test/prism/snapshots/xstring.txt
@@ -4,6 +4,7 @@
     @ StatementsNode (location: (1,0)-(7,5))
     └── body: (length: 4)
         ├── @ XStringNode (location: (1,0)-(1,7))
+        │   ├── flags: ∅
         │   ├── opening_loc: (1,0)-(1,3) = "%x["
         │   ├── content_loc: (1,3)-(1,6) = "foo"
         │   ├── closing_loc: (1,6)-(1,7) = "]"
@@ -41,11 +42,13 @@
         │   │       └── unescaped: " baz"
         │   └── closing_loc: (3,15)-(3,16) = "`"
         ├── @ XStringNode (location: (5,0)-(5,6))
+        │   ├── flags: ∅
         │   ├── opening_loc: (5,0)-(5,1) = "`"
         │   ├── content_loc: (5,1)-(5,5) = "f\\oo"
         │   ├── closing_loc: (5,5)-(5,6) = "`"
         │   └── unescaped: "foo"
         └── @ XStringNode (location: (7,0)-(7,5))
+            ├── flags: ∅
             ├── opening_loc: (7,0)-(7,1) = "`"
             ├── content_loc: (7,1)-(7,4) = "foo"
             ├── closing_loc: (7,4)-(7,5) = "`"
author	Kevin Newton <kddnewton@gmail.com>	2023-12-04 12:51:22 -0500
committer	Kevin Newton <kddnewton@gmail.com>	2023-12-06 14:23:38 -0500
commit	82f18baa21d0df59c30d8a6e60bf3e0991de1114 (patch)
tree	d861044ddaf3d334fee10325f15eab9887ae546b
parent	9620ca678929f28dd8dab8e278e438a430a85022 (diff)