summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKevin Newton <kddnewton@gmail.com>2023-12-06 12:54:16 -0500
committerKevin Newton <kddnewton@gmail.com>2023-12-06 14:23:38 -0500
commit153c09f24be77b8db986c2914c141b5bd8f51f67 (patch)
treee4af0d06038b8f300f203b5873ddda7d0d895e1f
parent82f18baa21d0df59c30d8a6e60bf3e0991de1114 (diff)
[prism] Handle string and xstring encodings
-rw-r--r--prism_compile.c41
1 files changed, 32 insertions, 9 deletions
diff --git a/prism_compile.c b/prism_compile.c
index e70fa75aae..7540a95ba7 100644
--- a/prism_compile.c
+++ b/prism_compile.c
@@ -182,14 +182,34 @@ parse_imaginary(pm_imaginary_node_t *node)
static inline VALUE
parse_string(pm_string_t *string, const pm_parser_t *parser)
{
- rb_encoding *enc = rb_enc_from_index(rb_enc_find_index(parser->encoding.name));
+ rb_encoding *enc = rb_enc_from_index(rb_enc_find_index(parser->encoding->name));
return rb_enc_str_new((const char *) pm_string_source(string), pm_string_length(string), enc);
}
+/**
+ * Certain strings can have their encoding differ from the parser's encoding due
+ * to bytes or escape sequences that have the top bit set. This function handles
+ * creating those strings based on the flags set on the owning node.
+ */
+static inline VALUE
+parse_string_encoded(const pm_node_t *node, const pm_string_t *string, const pm_parser_t *parser) {
+ rb_encoding *encoding;
+
+ if (node->flags & PM_ENCODING_FLAGS_FORCED_BINARY_ENCODING) {
+ encoding = rb_ascii8bit_encoding();
+ } else if (node->flags & PM_ENCODING_FLAGS_FORCED_UTF8_ENCODING) {
+ encoding = rb_utf8_encoding();
+ } else {
+ encoding = rb_enc_from_index(rb_enc_find_index(parser->encoding->name));
+ }
+
+ return rb_enc_str_new((const char *) pm_string_source(string), pm_string_length(string), encoding);
+}
+
static inline ID
parse_symbol(const uint8_t *start, const uint8_t *end, pm_parser_t *parser)
{
- rb_encoding *enc = rb_enc_from_index(rb_enc_find_index(parser->encoding.name));
+ rb_encoding *enc = rb_enc_from_index(rb_enc_find_index(parser->encoding->name));
return rb_intern3((const char *) start, end - start, enc);
}
@@ -278,7 +298,7 @@ pm_reg_enc(const pm_regular_expression_node_t *node, const pm_parser_t *parser)
return rb_utf8_encoding();
}
- return rb_enc_from_index(rb_enc_find_index(parser->encoding.name));
+ return rb_enc_from_index(rb_enc_find_index(parser->encoding->name));
}
/**
@@ -362,7 +382,7 @@ pm_static_literal_value(const pm_node_t *node, pm_scope_node_t *scope_node, pm_p
return pm_new_regex(cast, parser);
}
case PM_SOURCE_ENCODING_NODE: {
- rb_encoding *encoding = rb_find_encoding(rb_str_new_cstr(scope_node->parser->encoding.name));
+ rb_encoding *encoding = rb_find_encoding(rb_str_new_cstr(scope_node->parser->encoding->name));
if (!encoding) rb_bug("Encoding not found!");
return rb_enc_from_encoding(encoding);
}
@@ -4431,8 +4451,9 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
}
case PM_STRING_NODE: {
if (!popped) {
- pm_string_node_t *string_node = (pm_string_node_t *) node;
- ADD_INSN1(ret, &dummy_line_node, putstring, parse_string(&string_node->unescaped, parser));
+ pm_string_node_t *cast = (pm_string_node_t *) node;
+ VALUE value = parse_string_encoded(node, &cast->unescaped, parser);
+ ADD_INSN1(ret, &dummy_line_node, putstring, value);
}
return;
}
@@ -4553,9 +4574,11 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
return;
}
case PM_X_STRING_NODE: {
- pm_x_string_node_t *xstring_node = (pm_x_string_node_t *) node;
+ pm_x_string_node_t *cast = (pm_x_string_node_t *) node;
+ VALUE value = parse_string_encoded(node, &cast->unescaped, parser);
+
PM_PUTSELF;
- ADD_INSN1(ret, &dummy_line_node, putobject, parse_string(&xstring_node->unescaped, parser));
+ ADD_INSN1(ret, &dummy_line_node, putobject, value);
ADD_SEND_WITH_FLAG(ret, &dummy_line_node, idBackquote, INT2NUM(1), INT2FIX(VM_CALL_FCALL | VM_CALL_ARGS_SIMPLE));
PM_POP_IF_POPPED;
@@ -4599,7 +4622,7 @@ rb_translate_prism(pm_parser_t *parser, rb_iseq_t *iseq, pm_scope_node_t *scope_
RUBY_ASSERT(ISEQ_COMPILE_DATA(iseq));
ID *constants = calloc(parser->constant_pool.size, sizeof(ID));
- rb_encoding *encoding = rb_enc_find(parser->encoding.name);
+ rb_encoding *encoding = rb_enc_find(parser->encoding->name);
for (uint32_t index = 0; index < parser->constant_pool.size; index++) {
pm_constant_t *constant = &parser->constant_pool.constants[index];
constants[index] = rb_intern3((const char *) constant->start, constant->length, encoding);