1 files changed, 971 insertions, 44 deletions
diff --git a/prism/regexp.c b/prism/regexp.c
index dcc7476244..cc17aa4d09 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -1,5 +1,20 @@
-#include "prism/regexp.h"
-
+#include "prism/internal/regexp.h"
+
+#include "prism/compiler/inline.h"
+#include "prism/compiler/fallthrough.h"
+#include "prism/internal/buffer.h"
+#include "prism/internal/char.h"
+#include "prism/internal/diagnostic.h"
+#include "prism/internal/encoding.h"
+#include "prism/internal/memchr.h"
+#include "prism/internal/parser.h"
+#include "prism/internal/stringy.h"
+#include "prism/internal/strncasecmp.h"
+
+#include <assert.h>
+#include <string.h>
+
+/** The maximum depth of nested groups allowed in a regular expression. */
 #define PM_REGEXP_PARSE_DEPTH_MAX 4096
 
 /**
@@ -18,6 +33,54 @@ typedef struct {
     /** A pointer to the end of the source that we are parsing. */
     const uint8_t *end;
 
+    /** The encoding of the source. */
+    const pm_encoding_t *encoding;
+
+    /** The callback to call when a named capture group is found. */
+    pm_regexp_name_callback_t name_callback;
+
+    /** The data to pass to the name callback. */
+    pm_regexp_name_data_t *name_data;
+
+    /** The start of the regexp node (for error locations). */
+    const uint8_t *node_start;
+
+    /** The end of the regexp node (for error locations). */
+    const uint8_t *node_end;
+
+    /**
+     * The explicit encoding determined by escape sequences. NULL if no
+     * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the
+     * source encoding for `\x` escapes.
+     */
+    const pm_encoding_t *explicit_encoding;
+
+    /**
+     * Pointer to the first non-POSIX property name (for /n error messages).
+     * POSIX properties (Alnum, Alpha, etc.) work in all encodings.
+     * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u.
+     * Unicode-only properties (L, Ll, etc.) work only in /u.
+     */
+    const uint8_t *property_name;
+
+    /** Length of the first non-POSIX property name found. */
+    size_t property_name_length;
+
+    /**
+     * Pointer to the first Unicode-only property name (for /e, /s error
+     * messages). NULL if only POSIX or script properties have been seen.
+     */
+    const uint8_t *unicode_property_name;
+
+    /** Length of the first Unicode-only property name found. */
+    size_t unicode_property_name_length;
+
+    /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */
+    pm_buffer_t hex_escape_buffer;
+
+    /** Count of non-ASCII literal bytes (not from escapes). */
+    uint32_t non_ascii_literal_count;
+
     /**
      * Whether or not the regular expression currently being parsed is in
      * extended mode, wherein whitespace is ignored and comments are allowed.
@@ -27,31 +90,77 @@ typedef struct {
     /** Whether the encoding has changed from the default. */
     bool encoding_changed;
 
-    /** The encoding of the source. */
-    const pm_encoding_t *encoding;
+    /** Whether the source content is shared (for named capture callback). */
+    bool shared;
 
-    /** The callback to call when a named capture group is found. */
-    pm_regexp_name_callback_t name_callback;
+    /** Whether a `\u{...}` escape with value >= 0x80 was seen. */
+    bool has_unicode_escape;
 
-    /** The data to pass to the name callback. */
-    void *name_data;
+    /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */
+    bool has_hex_escape;
+
+    /**
+     * Tracks whether the last encoding-setting escape was `\u` (true) or `\x`
+     * (false). This matters for error messages when both types are mixed.
+     */
+    bool last_escape_was_unicode;
+
+    /** Whether any `\p{...}` or `\P{...}` property escape was found. */
+    bool has_property_escape;
+
+    /** Whether a Unicode-only property escape was found (not POSIX or script). */
+    bool has_unicode_property_escape;
 
-    /** The callback to call when a parse error is found. */
-    pm_regexp_error_callback_t error_callback;
+    /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */
+    bool invalid_unicode_range;
 
-    /** The data to pass to the error callback. */
-    void *error_data;
+    /** Whether we are accumulating consecutive hex escape bytes. */
+    bool hex_group_active;
+
+    /** Whether an invalid multibyte character was found during parsing. */
+    bool has_invalid_multibyte;
 } pm_regexp_parser_t;
 
 /**
- * Append an error to the parser.
+ * Append a syntax error to the parser's error list. If the source is shared
+ * (points into the original source), we can point to the exact error location.
+ * Otherwise, we point to the whole regexp node.
  */
-static inline void
+static PRISM_INLINE void
 pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
-    parser->error_callback(start, end, message, parser->error_data);
+    pm_parser_t *pm = parser->parser;
+    uint32_t loc_start, loc_length;
+
+    if (parser->shared) {
+        loc_start = (uint32_t) (start - pm->start);
+        loc_length = (uint32_t) (end - start);
+    } else {
+        loc_start = (uint32_t) (parser->node_start - pm->start);
+        loc_length = (uint32_t) (parser->node_end - parser->node_start);
+    }
+
+    pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message);
 }
 
 /**
+ * Append a formatted diagnostic error with proper shared/non-shared location
+ * handling. This is a macro because we need variadic args for the format string.
+ */
+#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \
+    do { \
+        pm_parser_t *pm__ = (parser_)->parser; \
+        uint32_t loc_start__, loc_length__; \
+        if ((parser_)->shared) { \
+            loc_start__ = (uint32_t) ((err_start_) - pm__->start); \
+            loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \
+        } else { \
+            loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \
+            loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \
+        } \
+        pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \
+    } while (0)
+
+/**
  * This appends a new string to the list of named captures. This function
  * assumes the caller has already checked the validity of the name callback.
  */
@@ -59,14 +168,14 @@ static void
 pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
     pm_string_t string;
     pm_string_shared_init(&string, start, end);
-    parser->name_callback(&string, parser->name_data);
-    pm_string_free(&string);
+    parser->name_callback(parser->parser, &string, parser->shared, parser->name_data);
+    pm_string_cleanup(&string);
 }
 
 /**
  * Returns true if the next character is the end of the source.
  */
-static inline bool
+static PRISM_INLINE bool
 pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
     return parser->cursor >= parser->end;
 }
@@ -74,7 +183,7 @@ pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
 /**
  * Optionally accept a char and consume it if it exists.
  */
-static inline bool
+static PRISM_INLINE bool
 pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
     if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
         parser->cursor++;
@@ -86,7 +195,7 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
 /**
  * Expect a character to be present and consume it.
  */
-static inline bool
+static PRISM_INLINE bool
 pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
     if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
         parser->cursor++;
@@ -114,6 +223,47 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
 }
 
 /**
+ * Mark a group boundary in the hex escape byte buffer. When consecutive hex
+ * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00
+ * sentinel to separate the groups for later multibyte validation.
+ */
+static PRISM_INLINE void
+pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) {
+    if (parser->hex_group_active) {
+        pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00);
+        parser->hex_group_active = false;
+    }
+}
+
+/**
+ * Track a hex escape byte value >= 0x80 for multibyte validation.
+ */
+static PRISM_INLINE void
+pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) {
+    if (byte >= 0x80) {
+        pm_buffer_append_byte(&parser->hex_escape_buffer, byte);
+        parser->hex_group_active = true;
+        parser->has_hex_escape = true;
+
+        parser->explicit_encoding = parser->encoding;
+        parser->last_escape_was_unicode = false;
+    } else {
+        pm_regexp_hex_group_boundary(parser);
+    }
+}
+
+/**
+ * Parse a hex digit character and return its value, or -1 if not a hex digit.
+ */
+static PRISM_INLINE int
+pm_regexp_hex_digit_value(uint8_t byte) {
+    if (byte >= '0' && byte <= '9') return byte - '0';
+    if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10;
+    if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10;
+    return -1;
+}
+
+/**
  * Range quantifiers are a special class of quantifiers that look like
  *
  * * {digit}
@@ -121,13 +271,12 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
  * * {digit,digit}
  * * {,digit}
  *
- * Unfortunately, if there are any spaces in between, then this just becomes a
- * regular character match expression and we have to backtrack. So when this
- * function first starts running, we'll create a "save" point and then attempt
- * to parse the quantifier. If it fails, we'll restore the save point and
- * return.
+ * If there are any spaces in between, then this just becomes a regular
+ * character match expression and we have to backtrack. So when this function
+ * first starts running, we'll create a "save" point and then attempt to parse
+ * the quantifier. If it fails, we'll restore the save point and return.
  *
- * The properly track everything, we're going to build a little state machine.
+ * To properly track everything, we're going to build a little state machine.
  * It looks something like the following:
  *
  *                  +-------+                 +---------+ ------------+
@@ -275,11 +424,393 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
     );
 }
 
+/**
+ * Property escape classification. Onigmo supports three tiers of property
+ * names depending on the encoding:
+ *
+ * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower,
+ *   Print, Punct, Space, Upper, XDigit, Word): valid in all encodings.
+ * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid
+ *   in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n).
+ * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus
+ *   Any, Assigned): valid only in UTF-8 (/u).
+ */
+typedef enum {
+    PM_REGEXP_PROPERTY_POSIX,
+    PM_REGEXP_PROPERTY_SCRIPT,
+    PM_REGEXP_PROPERTY_UNICODE
+} pm_regexp_property_type_t;
+
+/**
+ * Classify a property name. The name may start with '^' for negation, which
+ * is skipped before matching.
+ */
+static pm_regexp_property_type_t
+pm_regexp_classify_property(const uint8_t *name, size_t length) {
+    // Skip leading '^' for negated properties like \p{^Hiragana}.
+    if (length > 0 && name[0] == '^') {
+        name++;
+        length--;
+    }
+
+#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0)
+
+    switch (length) {
+        case 3:
+            if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT;
+            break;
+        case 4:
+            if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX;
+            break;
+        case 5:
+            /* Most properties are length 5, so dispatch on first character. */
+            switch (name[0] | 0x20) {
+                case 'a':
+                    if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX;
+                    if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX;
+                    if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 'b':
+                    if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 'c':
+                    if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 'd':
+                    if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 'g':
+                    if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX;
+                    if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT;
+                    break;
+                case 'l':
+                    if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX;
+                    if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT;
+                    break;
+                case 'p':
+                    if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX;
+                    if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 's':
+                    if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+                case 'u':
+                    if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX;
+                    break;
+            }
+            break;
+        case 6:
+            if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX;
+            break;
+        case 8:
+            if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT;
+            if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT;
+            if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT;
+            break;
+    }
+
+#undef PM_REGEXP_CASECMP
+
+    // Everything else is Unicode-only (general categories, other scripts, etc.).
+    return PM_REGEXP_PROPERTY_UNICODE;
+}
+
+/**
+ * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The
+ * cursor should be pointing at 'p' or 'P' when this is called. If a property
+ * escape is found, record it on the regexp parser and advance past the closing
+ * '}'.
+ *
+ * Properties are classified into three tiers (POSIX, script, Unicode-only) to
+ * determine which encoding modifiers they are valid with.
+ */
+static bool
+pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) {
+    assert(*parser->cursor == 'p' || *parser->cursor == 'P');
+
+    if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') {
+        const uint8_t *name_start = parser->cursor + 2;
+        const uint8_t *search = name_start;
+
+        while (search < parser->end && *search != '}') search++;
+
+        if (search < parser->end) {
+            size_t name_length = (size_t) (search - name_start);
+            parser->has_property_escape = true;
+
+            pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length);
+
+            // Track the first non-POSIX property name (for /n error messages).
+            if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) {
+                parser->property_name = name_start;
+                parser->property_name_length = name_length;
+            }
+
+            // Track the first Unicode-only property name (for /e, /s error messages).
+            if (type == PM_REGEXP_PROPERTY_UNICODE) {
+                parser->has_unicode_property_escape = true;
+                if (parser->unicode_property_name == NULL) {
+                    parser->unicode_property_name = name_start;
+                    parser->unicode_property_name_length = name_length;
+                }
+            }
+
+            parser->cursor = search + 1; // skip past '}'
+            return true;
+        }
+    }
+
+    // Not a property escape, just skip the single character after '\'.
+    parser->cursor++;
+    return false;
+}
+
+/**
+ * Validate and skip a \u escape sequence in a regular expression. The cursor
+ * should be pointing at the character after 'u' when this is called. This
+ * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding
+ * state for validation.
+ */
+static void
+pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) {
+    const uint8_t *escape_start = parser->cursor - 2; // points to '\'
+
+    if (pm_regexp_char_is_eof(parser)) {
+        pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+        return;
+    }
+
+    if (*parser->cursor == '{') {
+        parser->cursor++; // skip '{'
+
+        // Skip leading whitespace.
+        while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+            parser->cursor++;
+        }
+
+        bool has_codepoint = false;
+
+        while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+            // Parse the hex digits to compute the codepoint value.
+            uint32_t value = 0;
+            size_t hex_count = 0;
+
+            int digit;
+            while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+                value = (value << 4) | (uint32_t) digit;
+                hex_count++;
+                parser->cursor++;
+            }
+
+            if (hex_count == 0) {
+                // Skip to '}' or end of regexp to find the full extent.
+                while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+                    parser->cursor++;
+                }
+
+                const uint8_t *escape_end = parser->cursor;
+                if (!pm_regexp_char_is_eof(parser)) {
+                    escape_end++;
+                    parser->cursor++; // skip '}'
+                }
+
+                pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start);
+                return;
+            }
+
+            if (hex_count > 6) {
+                pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range");
+            }
+
+            // Track encoding state for this codepoint.
+            if (value >= 0x80) {
+                parser->has_unicode_escape = true;
+                parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+                parser->last_escape_was_unicode = true;
+                pm_regexp_hex_group_boundary(parser);
+            }
+
+            // Check for invalid Unicode range (surrogates or > 0x10FFFF).
+            if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
+                parser->invalid_unicode_range = true;
+            }
+
+            has_codepoint = true;
+
+            // Skip whitespace between codepoints.
+            while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+                parser->cursor++;
+            }
+        }
+
+        if (pm_regexp_char_is_eof(parser)) {
+            pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape");
+        } else {
+            if (!has_codepoint) {
+                pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start);
+            }
+            parser->cursor++; // skip '}'
+        }
+    } else {
+        // \uNNNN form — need exactly 4 hex digits.
+        uint32_t value = 0;
+        size_t hex_count = 0;
+
+        int digit;
+        while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+            value = (value << 4) | (uint32_t) digit;
+            hex_count++;
+            parser->cursor++;
+        }
+
+        if (hex_count < 4) {
+            pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+        } else if (value >= 0x80) {
+            parser->has_unicode_escape = true;
+            parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+            parser->last_escape_was_unicode = true;
+            pm_regexp_hex_group_boundary(parser);
+        }
+
+        // Check for invalid Unicode range.
+        if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) {
+            parser->invalid_unicode_range = true;
+        }
+    }
+}
+
 // Forward declaration because character sets can be nested.
 static bool
 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
 
 /**
+ * Parse a \x escape and return the byte value. The cursor should be pointing
+ * at the character after 'x'. Returns -1 if no hex digits follow.
+ */
+static int
+pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) {
+    int value = -1;
+
+    if (!pm_regexp_char_is_eof(parser)) {
+        int digit = pm_regexp_hex_digit_value(*parser->cursor);
+        if (digit >= 0) {
+            value = digit;
+            parser->cursor++;
+
+            if (!pm_regexp_char_is_eof(parser)) {
+                digit = pm_regexp_hex_digit_value(*parser->cursor);
+                if (digit >= 0) {
+                    value = (value << 4) | digit;
+                    parser->cursor++;
+                }
+            }
+        }
+    }
+
+    if (value >= 0) {
+        pm_regexp_track_hex_escape(parser, (uint8_t) value);
+    }
+
+    return value;
+}
+
+/**
+ * Parse a backslash escape sequence in a regexp, handling \u (unicode),
+ * \p/\P (property), \x (hex), and other single-character escapes. Also
+ * tracks encoding state for \M-x and \C-\M-x escapes.
+ */
+static void
+pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) {
+    if (pm_regexp_char_is_eof(parser)) return;
+
+    switch (*parser->cursor) {
+        case 'u':
+            parser->cursor++; // skip 'u'
+            pm_regexp_parse_unicode_escape(parser);
+            break;
+        case 'p':
+        case 'P':
+            pm_regexp_parse_property_escape(parser);
+            break;
+        case 'x':
+            parser->cursor++; // skip 'x'
+            pm_regexp_parse_hex_escape(parser);
+            break;
+        case 'M':
+            // \M-x produces (x | 0x80), always >= 0x80
+            if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+                parser->cursor += 2; // skip 'M-'
+                if (!pm_regexp_char_is_eof(parser)) {
+                    if (*parser->cursor == '\\') {
+                        parser->cursor++;
+                        // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80
+                        // We just need to track it as a hex escape >= 0x80.
+                        pm_regexp_parse_backslash_escape(parser);
+                    } else {
+                        parser->cursor++;
+                    }
+                    // \M-x always produces a byte >= 0x80
+                    pm_regexp_track_hex_escape(parser, 0x80);
+                }
+            } else {
+                parser->cursor++;
+            }
+            break;
+        case 'C':
+            // \C-x produces (x & 0x1F)
+            if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+                parser->cursor += 2; // skip 'C-'
+                if (!pm_regexp_char_is_eof(parser)) {
+                    if (*parser->cursor == '\\') {
+                        parser->cursor++;
+                        pm_regexp_parse_backslash_escape(parser);
+                    } else {
+                        parser->cursor++;
+                    }
+                }
+            } else {
+                parser->cursor++;
+            }
+            break;
+        case 'c':
+            // \cx produces (x & 0x1F)
+            parser->cursor++; // skip 'c'
+            if (!pm_regexp_char_is_eof(parser)) {
+                if (*parser->cursor == '\\') {
+                    parser->cursor++;
+                    pm_regexp_parse_backslash_escape(parser);
+                } else {
+                    parser->cursor++;
+                }
+            }
+            break;
+        default:
+            pm_regexp_hex_group_boundary(parser);
+            parser->cursor++;
+            break;
+    }
+}
+
+/**
+ * Check if a byte at the current position is a non-ASCII byte in a multibyte
+ * encoding that produces an invalid character. If so, emit an error at the
+ * byte location immediately.
+ */
+static void
+pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) {
+    uint8_t byte = *cursor;
+    if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) {
+        size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor));
+        if (width > 1) {
+            parser->cursor += width - 1;
+        } else if (width == 0) {
+            parser->has_invalid_multibyte = true;
+            pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+        }
+    }
+}
+
+/**
  * match-char-set : '[' '^'? (match-range | match-char)* ']'
  *                ;
  */
@@ -293,12 +824,16 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
                 pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
                 break;
             case '\\':
-                if (!pm_regexp_char_is_eof(parser)) {
-                    parser->cursor++;
-                }
+                pm_regexp_parse_backslash_escape(parser);
                 break;
             default:
-                // do nothing, we've already advanced the cursor
+                // We've already advanced the cursor by one byte. If the byte
+                // was >= 0x80 in a multibyte encoding, we may need to consume
+                // additional continuation bytes and validate the character.
+                if (*(parser->cursor - 1) >= 0x80) {
+                    parser->non_ascii_literal_count++;
+                }
+                pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1);
                 break;
         }
     }
@@ -354,8 +889,13 @@ typedef enum {
 // These are the options that are configurable on the regular expression (or
 // from within a group).
 
+/** The minimum character value for a regexp option slot. */
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
+
+/** The maximum character value for a regexp option slot. */
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
+
+/** The number of regexp option slots. */
 #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
 
 /**
@@ -498,7 +1038,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
                         }
 
                         size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
-                        if (width == 0) return false;
+                        if (width == 0) {
+                            if (*parser->cursor >= 0x80) {
+                                parser->has_invalid_multibyte = true;
+                                pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+                                parser->cursor++;
+                                continue;
+                            }
+                            return false;
+                        }
 
                         escaped = (width == 1) && (*parser->cursor == '\\');
                         parser->cursor += width;
@@ -686,9 +1234,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
             return pm_regexp_parse_quantifier(parser);
         case '\\':
             parser->cursor++;
-            if (!pm_regexp_char_is_eof(parser)) {
-                parser->cursor++;
-            }
+            pm_regexp_parse_backslash_escape(parser);
             return pm_regexp_parse_quantifier(parser);
         case '(':
             parser->cursor++;
@@ -720,9 +1266,30 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
                 width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
             }
 
-            if (width == 0) return false; // TODO: add appropriate error
-            parser->cursor += width;
+            if (width == 0) {
+                if (*parser->cursor >= 0x80 && parser->encoding_changed) {
+                    if (parser->encoding->multibyte) {
+                        // Invalid multibyte character in a multibyte encoding.
+                        // Emit the error at the byte location immediately.
+                        parser->has_invalid_multibyte = true;
+                        pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+                    } else {
+                        // Non-ASCII byte in a single-byte encoding (e.g.,
+                        // US-ASCII). Count it for later error reporting.
+                        parser->non_ascii_literal_count++;
+                    }
+                    parser->cursor++;
+                    return pm_regexp_parse_quantifier(parser);
+                }
+                return false;
+            }
+
+            // Count non-ASCII literal bytes.
+            for (size_t i = 0; i < width; i++) {
+                if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++;
+            }
 
+            parser->cursor += width;
             return pm_regexp_parse_quantifier(parser);
         }
     }
@@ -768,13 +1335,354 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
     return pm_regexp_char_is_eof(parser);
 }
 
+// ---------------------------------------------------------------------------
+// Encoding validation
+// ---------------------------------------------------------------------------
+
 /**
- * Parse a regular expression and extract the names of all of the named capture
- * groups.
+ * Validate that groups of hex escape bytes in the buffer form valid multibyte
+ * characters in the given encoding. Groups are separated by 0x00 sentinels.
+ */
+static bool
+pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) {
+    const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer);
+    size_t len = pm_buffer_length(buffer);
+    size_t i = 0;
+
+    while (i < len) {
+        size_t group_start = i;
+        while (i < len && data[i] != 0x00) i++;
+
+        for (size_t j = group_start; j < i; ) {
+            size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j));
+            if (width == 0) return false;
+            j += width;
+        }
+
+        if (i < len) i++; // skip sentinel
+    }
+
+    return true;
+}
+
+/**
+ * Format regexp source content for use in error messages, hex-escaping
+ * non-ASCII bytes.
+ */
+static void
+pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) {
+    size_t index = 0;
+
+    if (encoding == PM_ENCODING_UTF_8_ENTRY) {
+        pm_buffer_append_string(buffer, (const char *) source, length);
+        return;
+    }
+
+    while (index < length) {
+        if (source[index] < 0x80) {
+            pm_buffer_append_byte(buffer, source[index]);
+            index++;
+        } else if (encoding->multibyte) {
+            size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index));
+
+            if (width > 1) {
+                pm_buffer_append_string(buffer, "\\x{", 3);
+                for (size_t i = 0; i < width; i++) {
+                    pm_buffer_append_format(buffer, "%02X", source[index + i]);
+                }
+                pm_buffer_append_byte(buffer, '}');
+                index += width;
+            } else {
+                pm_buffer_append_format(buffer, "\\x%02X", source[index]);
+                index++;
+            }
+        } else {
+            pm_buffer_append_format(buffer, "\\x%02X", source[index]);
+            index++;
+        }
+    }
+}
+
+/**
+ * Emit an encoding validation error on the regexp node.
+ */
+#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \
+    pm_diagnostic_list_append_format( \
+        &(parser)->parser->metadata_arena, \
+        &(parser)->parser->error_list, \
+        (uint32_t) ((parser)->node_start - (parser)->parser->start), \
+        (uint32_t) ((parser)->node_end - (parser)->node_start), \
+        diag_id, __VA_ARGS__)
+
+/**
+ * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n).
+ *
+ * The decision tree is:
+ *
+ * 1. No escape-set encoding (explicit_encoding == NULL):
+ *    a. ASCII-only content: validate property escapes, return forced US-ASCII
+ *       for /n or the modifier flags for others.
+ *    b. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ *    c. Source encoding differs from modifier encoding: emit mismatch error.
+ *
+ * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending
+ *    on the modifier and which escape type was last.
+ *
+ * 3. \u escape with non-/u modifier: incompatible encoding error.
+ *
+ * 4. Validate that hex escape byte sequences form valid multibyte characters
+ *    in the modifier's encoding.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) {
+
+    if (parser->explicit_encoding == NULL) {
+        if (ascii_only) {
+            // Check property escapes against the modifier's encoding tier.
+            // /n (ASCII-8BIT): only POSIX properties are valid.
+            // /e, /s: POSIX and script properties are valid.
+            // /u: all properties are valid.
+            if (modifier == 'n' && parser->property_name != NULL) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+                    (int) parser->property_name_length, (const char *) parser->property_name,
+                    source_length, source_start);
+            } else if (modifier != 'u' && parser->has_unicode_property_escape) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+                    (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name,
+                    source_length, source_start);
+            }
+            return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
+        }
+
+        if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+            }
+        } else if (parser->encoding != modifier_encoding) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
+
+            if (modifier == 'n' && !ascii_only) {
+                pm_buffer_t formatted = { 0 };
+                pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length);
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value);
+                pm_buffer_cleanup(&formatted);
+            }
+        }
+
+        return flags;
+    }
+
+    // Mixed unicode + hex escapes.
+    if (parser->has_unicode_escape && parser->has_hex_escape) {
+        if (modifier == 'n') {
+            if (parser->last_escape_was_unicode) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+            } else {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+            }
+        } else {
+            if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+        }
+
+        return flags;
+    }
+
+    if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+        if (parser->last_escape_was_unicode) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+        } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+        }
+    }
+
+    if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+    }
+
+    return flags;
+}
+
+/**
+ * Validate encoding for a regexp without a modifier and compute the encoding
+ * flags to set on the node.
+ *
+ * The decision tree is:
+ *
+ * 1. If a modifier (/n, /u, /e, /s) is present, delegate to
+ *    pm_regexp_validate_encoding_modifier.
+ * 2. Invalid multibyte chars or unicode ranges: suppress further checks (errors
+ *    were already emitted during parsing).
+ * 3. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}).
+ * 5. Escape-set encoding present: validate hex escapes against the target
+ *    encoding, handle mixed \u + \x conflicts, and return the appropriate
+ *    forced encoding flag.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) {
+
+    // Invalid multibyte characters suppress further validation.
+    // Errors were already emitted at the byte locations during parsing.
+    if (parser->has_invalid_multibyte) {
+        return flags;
+    }
+
+    if (parser->invalid_unicode_range) {
+        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
+        return flags;
+    }
+
+    // Check modifier flags first.
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
+    }
+    if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
+    }
+
+    // No modifier — check for non-ASCII literals in US-ASCII encoding.
+    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
+        for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+        }
+    }
+
+    // ASCII-only regexps get downgraded to US-ASCII, unless property escapes
+    // force UTF-8.
+    if (ascii_only) {
+        if (parser->has_property_escape) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+        }
+        return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
+    }
+
+    // Check explicit encoding from escape sequences.
+    if (parser->explicit_encoding != NULL) {
+        // Mixed unicode + hex escapes without modifier.
+        if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+            if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY &&
+                parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
+                !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            } else if (parser->last_escape_was_unicode) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+            } else {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+            }
+
+            return 0;
+        }
+
+        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
+        } else {
+            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+            }
+        }
+    }
+
+    return 0;
+}
+
+/**
+ * Parse a regular expression, validate its encoding, and optionally extract
+ * named capture groups. Encoding validation walks the raw source (content_loc)
+ * to distinguish escape-produced bytes from literal bytes. Named capture
+ * extraction walks the unescaped content since escape sequences in group names
+ * (e.g., line continuations) have already been processed by the lexer.
+ */
+pm_node_flags_t
+pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+    const uint8_t *source = parser->start + node->content_loc.start;
+    size_t size = node->content_loc.length;
+    bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED);
+    pm_node_flags_t flags = PM_NODE_FLAGS(node);
+
+    const uint8_t *node_start = parser->start + node->base.location.start;
+    const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length;
+
+    // First pass: walk raw source for encoding validation (no name extraction).
+    pm_regexp_parser_t regexp_parser = {
+        .parser = parser,
+        .start = source,
+        .cursor = source,
+        .end = source + size,
+        .extended_mode = extended_mode,
+        .encoding_changed = parser->encoding_changed,
+        .encoding = parser->encoding,
+        .name_callback = NULL,
+        .name_data = NULL,
+        .shared = true,
+        .node_start = node_start,
+        .node_end = node_end,
+        .has_unicode_escape = false,
+        .has_hex_escape = false,
+        .last_escape_was_unicode = false,
+        .explicit_encoding = NULL,
+        .has_property_escape = false,
+        .has_unicode_property_escape = false,
+        .property_name = NULL,
+        .property_name_length = 0,
+        .unicode_property_name = NULL,
+        .unicode_property_name_length = 0,
+        .non_ascii_literal_count = 0,
+        .invalid_unicode_range = false,
+        .hex_escape_buffer = { 0 },
+        .hex_group_active = false,
+        .has_invalid_multibyte = false,
+    };
+
+    pm_regexp_parse_pattern(&regexp_parser);
+
+    // Compute ascii_only from the regexp parser's tracked state. We cannot
+    // use node->unescaped for this because regexp unescaped content preserves
+    // escape text (e.g., \x80 is 4 ASCII chars), not the binary values.
+    bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0;
+    // Use the unescaped content for error messages to match CRuby's format,
+    // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes
+    // like \u{80} are preserved as text.
+    const char *error_source = (const char *) pm_string_source(&node->unescaped);
+    int error_source_length = (int) pm_string_length(&node->unescaped);
+    pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(&regexp_parser, ascii_only, flags, error_source, error_source_length);
+    pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
+
+    // Second pass: walk unescaped content for named capture extraction.
+    if (name_callback != NULL) {
+        bool shared = node->unescaped.type == PM_STRING_SHARED;
+        pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data);
+    }
+
+    return encoding_flags;
+}
+
+/**
+ * Parse an interpolated regular expression for named capture groups only.
+ * This is used for the =~ operator with interpolated regexps where we don't
+ * have a pm_regular_expression_node_t. No encoding validation is performed.
+ *
+ * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.)
+ * are initialized but not used for the result. They exist because the parsing
+ * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update
+ * them as they walk through the content.
  */
-PRISM_EXPORTED_FUNCTION void
-pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) {
-    pm_regexp_parse_pattern(&(pm_regexp_parser_t) {
+void
+pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+    pm_regexp_parser_t regexp_parser = {
         .parser = parser,
         .start = source,
         .cursor = source,
@@ -784,7 +1692,26 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool ex
         .encoding = parser->encoding,
         .name_callback = name_callback,
         .name_data = name_data,
-        .error_callback = error_callback,
-        .error_data = error_data
-    });
+        .shared = shared,
+        .node_start = source,
+        .node_end = source + size,
+        .has_unicode_escape = false,
+        .has_hex_escape = false,
+        .last_escape_was_unicode = false,
+        .explicit_encoding = NULL,
+        .has_property_escape = false,
+        .has_unicode_property_escape = false,
+        .property_name = NULL,
+        .property_name_length = 0,
+        .unicode_property_name = NULL,
+        .unicode_property_name_length = 0,
+        .non_ascii_literal_count = 0,
+        .invalid_unicode_range = false,
+        .hex_escape_buffer = { 0 },
+        .hex_group_active = false,
+        .has_invalid_multibyte = false,
+    };
+
+    pm_regexp_parse_pattern(&regexp_parser);
+    pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
 }