summary | refs | log | tree | commit | diff
path: root/prism_compile.c
diff options
context:
space:
mode:
author: eileencodes <eileencodes@gmail.com>, 2023-10-24 13:17:18 -0400
committer: Aaron Patterson <aaron.patterson@gmail.com>, 2023-10-26 11:11:52 -0700
commit: a082e560bb3b875dfcce5ff4743a04e76e008d46 (patch)
tree: 2792062e7e4971bd07bee7bd2813d88a04baae19 /prism_compile.c
parent: 2573d568848f1dc54e6fb4d9ce203d75cc31fc3d (diff)
[PRISM] Implement regex encoding flags
Set the correct encoding on the allocated regex. This required adding a new function that determines the encoding from the regex's flags, so that it can be passed to `rb_enc_reg_new` instead of calling `rb_reg_new`. The previously used `rb_reg_new` would set the encoding to ASCII-8BIT regardless of the encoding flag.
Diffstat (limited to 'prism_compile.c')
-rw-r--r-- prism_compile.c 62
1 file changed, 57 insertions, 5 deletions
diff --git a/prism_compile.c b/prism_compile.c
index 52f75faaf0..2a2483e11c 100644
--- a/prism_compile.c
+++ b/prism_compile.c
@@ -158,7 +158,7 @@ parse_imaginary(pm_imaginary_node_t *node)
}
static inline VALUE
-parse_string(pm_string_t *string, pm_parser_t *parser)
+parse_string(pm_string_t *string, const pm_parser_t *parser)
{
rb_encoding *enc = rb_enc_from_index(rb_enc_find_index(parser->encoding.name));
return rb_enc_str_new((const char *) pm_string_source(string), pm_string_length(string), enc);
@@ -190,6 +190,8 @@ pm_optimizable_range_item_p(pm_node_t *node)
return (!node || PM_NODE_TYPE_P(node, PM_INTEGER_NODE) || PM_NODE_TYPE_P(node, PM_NIL_NODE));
}
+#define RE_OPTION_ENCODING_SHIFT 8
+
/**
* Check the prism flags of a regular expression-like node and return the flags
* that are expected by the CRuby VM.
@@ -197,6 +199,29 @@ pm_optimizable_range_item_p(pm_node_t *node)
static int
pm_reg_flags(const pm_node_t *node) {
int flags = 0;
+ int dummy = 0;
+
+ // Check "no encoding" first so that flags don't get clobbered
+ // We're calling `rb_char_to_option_kcode` in this case so that
+ // we don't need to have access to `ARG_ENCODING_NONE`
+ if (node->flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+ rb_char_to_option_kcode('n', &flags, &dummy);
+ }
+
+ if (node->flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+ rb_char_to_option_kcode('e', &flags, &dummy);
+ flags |= ('e' << RE_OPTION_ENCODING_SHIFT);
+ }
+
+ if (node->flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+ rb_char_to_option_kcode('s', &flags, &dummy);
+ flags |= ('s' << RE_OPTION_ENCODING_SHIFT);
+ }
+
+ if (node->flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+ rb_char_to_option_kcode('u', &flags, &dummy);
+ flags |= ('u' << RE_OPTION_ENCODING_SHIFT);
+ }
if (node->flags & PM_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE) {
flags |= ONIG_OPTION_IGNORECASE;
@@ -213,6 +238,27 @@ pm_reg_flags(const pm_node_t *node) {
return flags;
}
+static rb_encoding *
+pm_reg_enc(const pm_regular_expression_node_t *node, const pm_parser_t *parser) {
+ if (node->base.flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+ return rb_ascii8bit_encoding();
+ }
+
+ if (node->base.flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+ return rb_enc_get_from_index(ENCINDEX_EUC_JP);
+ }
+
+ if (node->base.flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+ return rb_enc_get_from_index(ENCINDEX_Windows_31J);
+ }
+
+ if (node->base.flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+ return rb_utf8_encoding();
+ }
+
+ return rb_enc_from_index(rb_enc_find_index(parser->encoding.name));
+}
+
/**
* Certain nodes can be compiled literally, which can lead to further
* optimizations. These nodes will all have the PM_NODE_FLAG_STATIC_LITERAL flag
@@ -224,6 +270,14 @@ pm_static_literal_p(const pm_node_t *node)
return node->flags & PM_NODE_FLAG_STATIC_LITERAL;
}
+static VALUE
+pm_new_regex(pm_regular_expression_node_t * cast, const pm_parser_t * parser) {
+ VALUE regex_str = parse_string(&cast->unescaped, parser);
+ rb_encoding * enc = pm_reg_enc(cast, parser);
+
+ return rb_enc_reg_new(RSTRING_PTR(regex_str), RSTRING_LEN(regex_str), enc, pm_reg_flags((const pm_node_t *)cast));
+}
+
/**
* Certain nodes can be compiled literally. This function returns the literal
* value described by the given node. For example, an array node with all static
@@ -283,8 +337,7 @@ pm_static_literal_value(const pm_node_t *node, pm_scope_node_t *scope_node, pm_p
case PM_REGULAR_EXPRESSION_NODE: {
pm_regular_expression_node_t *cast = (pm_regular_expression_node_t *) node;
- VALUE string = parse_string(&cast->unescaped, parser);
- return rb_reg_new(RSTRING_PTR(string), RSTRING_LEN(string), pm_reg_flags(node));
+ return pm_new_regex(cast, parser);
}
case PM_SOURCE_ENCODING_NODE: {
rb_encoding *encoding = rb_find_encoding(rb_str_new_cstr(scope_node->parser->encoding.name));
@@ -2797,8 +2850,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret,
if (!popped) {
pm_regular_expression_node_t *cast = (pm_regular_expression_node_t *) node;
- VALUE regex_str = parse_string(&cast->unescaped, parser);
- VALUE regex = rb_reg_new(RSTRING_PTR(regex_str), RSTRING_LEN(regex_str), pm_reg_flags(node));
+ VALUE regex = pm_new_regex(cast, parser);
ADD_INSN1(ret, &dummy_line_node, putobject, regex);
}