summaryrefslogtreecommitdiff
path: root/prism/regexp.c
diff options
context:
space:
mode:
Diffstat (limited to 'prism/regexp.c')
-rw-r--r--prism/regexp.c1717
1 files changed, 1717 insertions, 0 deletions
diff --git a/prism/regexp.c b/prism/regexp.c
new file mode 100644
index 0000000000..cc17aa4d09
--- /dev/null
+++ b/prism/regexp.c
@@ -0,0 +1,1717 @@
+#include "prism/internal/regexp.h"
+
+#include "prism/compiler/inline.h"
+#include "prism/compiler/fallthrough.h"
+#include "prism/internal/buffer.h"
+#include "prism/internal/char.h"
+#include "prism/internal/diagnostic.h"
+#include "prism/internal/encoding.h"
+#include "prism/internal/memchr.h"
+#include "prism/internal/parser.h"
+#include "prism/internal/stringy.h"
+#include "prism/internal/strncasecmp.h"
+
+#include <assert.h>
+#include <string.h>
+
+/** The maximum depth of nested groups allowed in a regular expression. */
+#define PM_REGEXP_PARSE_DEPTH_MAX 4096
+
+/**
+ * This is the parser that is going to handle parsing regular expressions.
+ */
+typedef struct {
+ /** The parser that is currently being used. */
+ pm_parser_t *parser;
+
+ /** A pointer to the start of the source that we are parsing. */
+ const uint8_t *start;
+
+ /** A pointer to the current position in the source. */
+ const uint8_t *cursor;
+
+ /** A pointer to the end of the source that we are parsing. */
+ const uint8_t *end;
+
+ /** The encoding of the source. */
+ const pm_encoding_t *encoding;
+
+ /** The callback to call when a named capture group is found. */
+ pm_regexp_name_callback_t name_callback;
+
+ /** The data to pass to the name callback. */
+ pm_regexp_name_data_t *name_data;
+
+ /** The start of the regexp node (for error locations). */
+ const uint8_t *node_start;
+
+ /** The end of the regexp node (for error locations). */
+ const uint8_t *node_end;
+
+ /**
+ * The explicit encoding determined by escape sequences. NULL if no
+ * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the
+ * source encoding for `\x` escapes.
+ */
+ const pm_encoding_t *explicit_encoding;
+
+ /**
+ * Pointer to the first non-POSIX property name (for /n error messages).
+ * POSIX properties (Alnum, Alpha, etc.) work in all encodings.
+ * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u.
+ * Unicode-only properties (L, Ll, etc.) work only in /u.
+ */
+ const uint8_t *property_name;
+
+ /** Length of the first non-POSIX property name found. */
+ size_t property_name_length;
+
+ /**
+ * Pointer to the first Unicode-only property name (for /e, /s error
+ * messages). NULL if only POSIX or script properties have been seen.
+ */
+ const uint8_t *unicode_property_name;
+
+ /** Length of the first Unicode-only property name found. */
+ size_t unicode_property_name_length;
+
+ /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */
+ pm_buffer_t hex_escape_buffer;
+
+ /** Count of non-ASCII literal bytes (not from escapes). */
+ uint32_t non_ascii_literal_count;
+
+ /**
+ * Whether or not the regular expression currently being parsed is in
+ * extended mode, wherein whitespace is ignored and comments are allowed.
+ */
+ bool extended_mode;
+
+ /** Whether the encoding has changed from the default. */
+ bool encoding_changed;
+
+ /** Whether the source content is shared (for named capture callback). */
+ bool shared;
+
+ /** Whether a `\u{...}` escape with value >= 0x80 was seen. */
+ bool has_unicode_escape;
+
+ /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */
+ bool has_hex_escape;
+
+ /**
+ * Tracks whether the last encoding-setting escape was `\u` (true) or `\x`
+ * (false). This matters for error messages when both types are mixed.
+ */
+ bool last_escape_was_unicode;
+
+ /** Whether any `\p{...}` or `\P{...}` property escape was found. */
+ bool has_property_escape;
+
+ /** Whether a Unicode-only property escape was found (not POSIX or script). */
+ bool has_unicode_property_escape;
+
+ /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */
+ bool invalid_unicode_range;
+
+ /** Whether we are accumulating consecutive hex escape bytes. */
+ bool hex_group_active;
+
+ /** Whether an invalid multibyte character was found during parsing. */
+ bool has_invalid_multibyte;
+} pm_regexp_parser_t;
+
+/**
+ * Append a syntax error to the parser's error list. If the source is shared
+ * (points into the original source), we can point to the exact error location.
+ * Otherwise, we point to the whole regexp node.
+ */
+static PRISM_INLINE void
+pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) {
+ pm_parser_t *pm = parser->parser;
+ uint32_t loc_start, loc_length;
+
+ if (parser->shared) {
+ loc_start = (uint32_t) (start - pm->start);
+ loc_length = (uint32_t) (end - start);
+ } else {
+ loc_start = (uint32_t) (parser->node_start - pm->start);
+ loc_length = (uint32_t) (parser->node_end - parser->node_start);
+ }
+
+ pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message);
+}
+
+/**
+ * Append a formatted diagnostic error with proper shared/non-shared location
+ * handling. This is a macro because we need variadic args for the format string.
+ */
+#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \
+ do { \
+ pm_parser_t *pm__ = (parser_)->parser; \
+ uint32_t loc_start__, loc_length__; \
+ if ((parser_)->shared) { \
+ loc_start__ = (uint32_t) ((err_start_) - pm__->start); \
+ loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \
+ } else { \
+ loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \
+ loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \
+ } \
+ pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \
+ } while (0)
+
+/**
+ * This appends a new string to the list of named captures. This function
+ * assumes the caller has already checked the validity of the name callback.
+ */
+static void
+pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+ pm_string_t string;
+ pm_string_shared_init(&string, start, end);
+ parser->name_callback(parser->parser, &string, parser->shared, parser->name_data);
+ pm_string_cleanup(&string);
+}
+
+/**
+ * Returns true if the next character is the end of the source.
+ */
+static PRISM_INLINE bool
+pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
+ return parser->cursor >= parser->end;
+}
+
+/**
+ * Optionally accept a char and consume it if it exists.
+ */
+static PRISM_INLINE bool
+pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
+ parser->cursor++;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * Expect a character to be present and consume it.
+ */
+static PRISM_INLINE bool
+pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
+ if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
+ parser->cursor++;
+ return true;
+ }
+ return false;
+}
+
+/**
+ * This advances the current token to the next instance of the given character.
+ */
+static bool
+pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
+ if (pm_regexp_char_is_eof(parser)) {
+ return false;
+ }
+
+ const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
+ if (end == NULL) {
+ return false;
+ }
+
+ parser->cursor = end + 1;
+ return true;
+}
+
+/**
+ * Mark a group boundary in the hex escape byte buffer. When consecutive hex
+ * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00
+ * sentinel to separate the groups for later multibyte validation.
+ */
+static PRISM_INLINE void
+pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) {
+ if (parser->hex_group_active) {
+ pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00);
+ parser->hex_group_active = false;
+ }
+}
+
+/**
+ * Track a hex escape byte value >= 0x80 for multibyte validation.
+ */
+static PRISM_INLINE void
+pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) {
+ if (byte >= 0x80) {
+ pm_buffer_append_byte(&parser->hex_escape_buffer, byte);
+ parser->hex_group_active = true;
+ parser->has_hex_escape = true;
+
+ parser->explicit_encoding = parser->encoding;
+ parser->last_escape_was_unicode = false;
+ } else {
+ pm_regexp_hex_group_boundary(parser);
+ }
+}
+
+/**
+ * Parse a hex digit character and return its value, or -1 if not a hex digit.
+ */
+static PRISM_INLINE int
+pm_regexp_hex_digit_value(uint8_t byte) {
+ if (byte >= '0' && byte <= '9') return byte - '0';
+ if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10;
+ if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10;
+ return -1;
+}
+
+/**
+ * Range quantifiers are a special class of quantifiers that look like
+ *
+ * * {digit}
+ * * {digit,}
+ * * {digit,digit}
+ * * {,digit}
+ *
+ * If there are any spaces in between, then this just becomes a regular
+ * character match expression and we have to backtrack. So when this function
+ * first starts running, we'll create a "save" point and then attempt to parse
+ * the quantifier. If it fails, we'll restore the save point and return.
+ *
+ * To properly track everything, we're going to build a little state machine.
+ * It looks something like the following:
+ *
+ * +-------+ +---------+ ------------+
+ * ---- lbrace ---> | start | ---- digit ---> | minimum | |
+ * +-------+ +---------+ <--- digit -+
+ * | | |
+ * +-------+ | | rbrace
+ * | comma | <----- comma +---- comma -------+ |
+ * +-------+ V V
+ * | +---------+ +---------+
+ * +-- digit --> | maximum | -- rbrace --> || final ||
+ * +---------+ +---------+
+ * | ^
+ * +- digit -+
+ *
+ * Note that by the time we've hit this function, the lbrace has already been
+ * consumed so we're in the start state.
+ */
+static bool
+pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
+ const uint8_t *savepoint = parser->cursor;
+
+ enum {
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
+ PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
+ } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
+
+ while (1) {
+ if (parser->cursor >= parser->end) {
+ parser->cursor = savepoint;
+ return true;
+ }
+
+ switch (state) {
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
+ break;
+ case ',':
+ parser->cursor++;
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
+ break;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ break;
+ case ',':
+ parser->cursor++;
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
+ break;
+ case '}':
+ parser->cursor++;
+ return true;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
+ break;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
+ switch (*parser->cursor) {
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
+ parser->cursor++;
+ break;
+ case '}':
+ parser->cursor++;
+ return true;
+ default:
+ parser->cursor = savepoint;
+ return true;
+ }
+ break;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * quantifier : star-quantifier
+ * | plus-quantifier
+ * | optional-quantifier
+ * | range-quantifier
+ * | <empty>
+ * ;
+ */
+static bool
+pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
+ while (!pm_regexp_char_is_eof(parser)) {
+ switch (*parser->cursor) {
+ case '*':
+ case '+':
+ case '?':
+ parser->cursor++;
+ break;
+ case '{':
+ parser->cursor++;
+ if (!pm_regexp_parse_range_quantifier(parser)) return false;
+ break;
+ default:
+ // In this case there is no quantifier.
+ return true;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
+ * ;
+ */
+static bool
+pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
+ if (!pm_regexp_char_expect(parser, ':')) {
+ return false;
+ }
+
+ pm_regexp_char_accept(parser, '^');
+
+ return (
+ pm_regexp_char_find(parser, ':') &&
+ pm_regexp_char_expect(parser, ']') &&
+ pm_regexp_char_expect(parser, ']')
+ );
+}
+
+/**
+ * Property escape classification. Onigmo supports three tiers of property
+ * names depending on the encoding:
+ *
+ * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower,
+ * Print, Punct, Space, Upper, XDigit, Word): valid in all encodings.
+ * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid
+ * in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n).
+ * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus
+ * Any, Assigned): valid only in UTF-8 (/u).
+ */
+typedef enum {
+ PM_REGEXP_PROPERTY_POSIX,
+ PM_REGEXP_PROPERTY_SCRIPT,
+ PM_REGEXP_PROPERTY_UNICODE
+} pm_regexp_property_type_t;
+
+/**
+ * Classify a property name. The name may start with '^' for negation, which
+ * is skipped before matching.
+ */
+static pm_regexp_property_type_t
+pm_regexp_classify_property(const uint8_t *name, size_t length) {
+ // Skip leading '^' for negated properties like \p{^Hiragana}.
+ if (length > 0 && name[0] == '^') {
+ name++;
+ length--;
+ }
+
+#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0)
+
+ switch (length) {
+ case 3:
+ if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT;
+ break;
+ case 4:
+ if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 5:
+ /* Most properties are length 5, so dispatch on first character. */
+ switch (name[0] | 0x20) {
+ case 'a':
+ if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX;
+ if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX;
+ if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 'b':
+ if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 'c':
+ if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 'd':
+ if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 'g':
+ if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX;
+ if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT;
+ break;
+ case 'l':
+ if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX;
+ if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT;
+ break;
+ case 'p':
+ if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX;
+ if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 's':
+ if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 'u':
+ if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ }
+ break;
+ case 6:
+ if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX;
+ break;
+ case 8:
+ if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT;
+ if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT;
+ if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT;
+ break;
+ }
+
+#undef PM_REGEXP_CASECMP
+
+ // Everything else is Unicode-only (general categories, other scripts, etc.).
+ return PM_REGEXP_PROPERTY_UNICODE;
+}
+
+/**
+ * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The
+ * cursor should be pointing at 'p' or 'P' when this is called. If a property
+ * escape is found, record it on the regexp parser and advance past the closing
+ * '}'.
+ *
+ * Properties are classified into three tiers (POSIX, script, Unicode-only) to
+ * determine which encoding modifiers they are valid with.
+ */
+static bool
+pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) {
+ assert(*parser->cursor == 'p' || *parser->cursor == 'P');
+
+ if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') {
+ const uint8_t *name_start = parser->cursor + 2;
+ const uint8_t *search = name_start;
+
+ while (search < parser->end && *search != '}') search++;
+
+ if (search < parser->end) {
+ size_t name_length = (size_t) (search - name_start);
+ parser->has_property_escape = true;
+
+ pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length);
+
+ // Track the first non-POSIX property name (for /n error messages).
+ if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) {
+ parser->property_name = name_start;
+ parser->property_name_length = name_length;
+ }
+
+ // Track the first Unicode-only property name (for /e, /s error messages).
+ if (type == PM_REGEXP_PROPERTY_UNICODE) {
+ parser->has_unicode_property_escape = true;
+ if (parser->unicode_property_name == NULL) {
+ parser->unicode_property_name = name_start;
+ parser->unicode_property_name_length = name_length;
+ }
+ }
+
+ parser->cursor = search + 1; // skip past '}'
+ return true;
+ }
+ }
+
+ // Not a property escape, just skip the single character after '\'.
+ parser->cursor++;
+ return false;
+}
+
+/**
+ * Validate and skip a \u escape sequence in a regular expression. The cursor
+ * should be pointing at the character after 'u' when this is called. This
+ * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding
+ * state for validation.
+ */
+static void
+pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) {
+ const uint8_t *escape_start = parser->cursor - 2; // points to '\'
+
+ if (pm_regexp_char_is_eof(parser)) {
+ pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+ return;
+ }
+
+ if (*parser->cursor == '{') {
+ parser->cursor++; // skip '{'
+
+ // Skip leading whitespace.
+ while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+ parser->cursor++;
+ }
+
+ bool has_codepoint = false;
+
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+ // Parse the hex digits to compute the codepoint value.
+ uint32_t value = 0;
+ size_t hex_count = 0;
+
+ int digit;
+ while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+ value = (value << 4) | (uint32_t) digit;
+ hex_count++;
+ parser->cursor++;
+ }
+
+ if (hex_count == 0) {
+ // Skip to '}' or end of regexp to find the full extent.
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') {
+ parser->cursor++;
+ }
+
+ const uint8_t *escape_end = parser->cursor;
+ if (!pm_regexp_char_is_eof(parser)) {
+ escape_end++;
+ parser->cursor++; // skip '}'
+ }
+
+ pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start);
+ return;
+ }
+
+ if (hex_count > 6) {
+ pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range");
+ }
+
+ // Track encoding state for this codepoint.
+ if (value >= 0x80) {
+ parser->has_unicode_escape = true;
+ parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+ parser->last_escape_was_unicode = true;
+ pm_regexp_hex_group_boundary(parser);
+ }
+
+ // Check for invalid Unicode range (surrogates or > 0x10FFFF).
+ if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) {
+ parser->invalid_unicode_range = true;
+ }
+
+ has_codepoint = true;
+
+ // Skip whitespace between codepoints.
+ while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) {
+ parser->cursor++;
+ }
+ }
+
+ if (pm_regexp_char_is_eof(parser)) {
+ pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape");
+ } else {
+ if (!has_codepoint) {
+ pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start);
+ }
+ parser->cursor++; // skip '}'
+ }
+ } else {
+ // \uNNNN form — need exactly 4 hex digits.
+ uint32_t value = 0;
+ size_t hex_count = 0;
+
+ int digit;
+ while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) {
+ value = (value << 4) | (uint32_t) digit;
+ hex_count++;
+ parser->cursor++;
+ }
+
+ if (hex_count < 4) {
+ pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape");
+ } else if (value >= 0x80) {
+ parser->has_unicode_escape = true;
+ parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
+ parser->last_escape_was_unicode = true;
+ pm_regexp_hex_group_boundary(parser);
+ }
+
+ // Check for invalid Unicode range.
+ if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) {
+ parser->invalid_unicode_range = true;
+ }
+ }
+}
+
+// Forward declaration because character sets can be nested.
+static bool
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth);
+
+/**
+ * Parse a \x escape and return the byte value. The cursor should be pointing
+ * at the character after 'x'. Returns -1 if no hex digits follow.
+ */
+static int
+pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) {
+ int value = -1;
+
+ if (!pm_regexp_char_is_eof(parser)) {
+ int digit = pm_regexp_hex_digit_value(*parser->cursor);
+ if (digit >= 0) {
+ value = digit;
+ parser->cursor++;
+
+ if (!pm_regexp_char_is_eof(parser)) {
+ digit = pm_regexp_hex_digit_value(*parser->cursor);
+ if (digit >= 0) {
+ value = (value << 4) | digit;
+ parser->cursor++;
+ }
+ }
+ }
+ }
+
+ if (value >= 0) {
+ pm_regexp_track_hex_escape(parser, (uint8_t) value);
+ }
+
+ return value;
+}
+
+/**
+ * Parse a backslash escape sequence in a regexp, handling \u (unicode),
+ * \p/\P (property), \x (hex), and other single-character escapes. Also
+ * tracks encoding state for \M-x and \C-\M-x escapes.
+ */
+static void
+pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) {
+ if (pm_regexp_char_is_eof(parser)) return;
+
+ switch (*parser->cursor) {
+ case 'u':
+ parser->cursor++; // skip 'u'
+ pm_regexp_parse_unicode_escape(parser);
+ break;
+ case 'p':
+ case 'P':
+ pm_regexp_parse_property_escape(parser);
+ break;
+ case 'x':
+ parser->cursor++; // skip 'x'
+ pm_regexp_parse_hex_escape(parser);
+ break;
+ case 'M':
+ // \M-x produces (x | 0x80), always >= 0x80
+ if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+ parser->cursor += 2; // skip 'M-'
+ if (!pm_regexp_char_is_eof(parser)) {
+ if (*parser->cursor == '\\') {
+ parser->cursor++;
+ // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80
+ // We just need to track it as a hex escape >= 0x80.
+ pm_regexp_parse_backslash_escape(parser);
+ } else {
+ parser->cursor++;
+ }
+ // \M-x always produces a byte >= 0x80
+ pm_regexp_track_hex_escape(parser, 0x80);
+ }
+ } else {
+ parser->cursor++;
+ }
+ break;
+ case 'C':
+ // \C-x produces (x & 0x1F)
+ if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') {
+ parser->cursor += 2; // skip 'C-'
+ if (!pm_regexp_char_is_eof(parser)) {
+ if (*parser->cursor == '\\') {
+ parser->cursor++;
+ pm_regexp_parse_backslash_escape(parser);
+ } else {
+ parser->cursor++;
+ }
+ }
+ } else {
+ parser->cursor++;
+ }
+ break;
+ case 'c':
+ // \cx produces (x & 0x1F)
+ parser->cursor++; // skip 'c'
+ if (!pm_regexp_char_is_eof(parser)) {
+ if (*parser->cursor == '\\') {
+ parser->cursor++;
+ pm_regexp_parse_backslash_escape(parser);
+ } else {
+ parser->cursor++;
+ }
+ }
+ break;
+ default:
+ pm_regexp_hex_group_boundary(parser);
+ parser->cursor++;
+ break;
+ }
+}
+
+/**
+ * Check if a byte at the current position is a non-ASCII byte in a multibyte
+ * encoding that produces an invalid character. If so, emit an error at the
+ * byte location immediately.
+ */
+static void
+pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) {
+ uint8_t byte = *cursor;
+ if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) {
+ size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor));
+ if (width > 1) {
+ parser->cursor += width - 1;
+ } else if (width == 0) {
+ parser->has_invalid_multibyte = true;
+ pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+ }
+ }
+}
+
+/**
+ * match-char-set : '[' '^'? (match-range | match-char)* ']'
+ * ;
+ */
+static bool
+pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) {
+ pm_regexp_char_accept(parser, '^');
+
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
+ switch (*parser->cursor++) {
+ case '[':
+ pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1));
+ break;
+ case '\\':
+ pm_regexp_parse_backslash_escape(parser);
+ break;
+ default:
+ // We've already advanced the cursor by one byte. If the byte
+ // was >= 0x80 in a multibyte encoding, we may need to consume
+ // additional continuation bytes and validate the character.
+ if (*(parser->cursor - 1) >= 0x80) {
+ parser->non_ascii_literal_count++;
+ }
+ pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1);
+ break;
+ }
+ }
+
+ return pm_regexp_char_expect(parser, ']');
+}
+
+/**
+ * A left bracket can either mean a POSIX class or a character set.
+ */
+static bool
+pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) {
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+ return false;
+ }
+
+ if ((parser->cursor < parser->end) && parser->cursor[0] == ']') {
+ parser->cursor++;
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class");
+ return true;
+ }
+
+ const uint8_t *reset = parser->cursor;
+
+ if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
+ parser->cursor++;
+ if (pm_regexp_parse_posix_class(parser)) return true;
+
+ parser->cursor = reset;
+ }
+
+ return pm_regexp_parse_character_set(parser, depth);
+}
+
+// Forward declaration here since parsing groups needs to go back up the grammar
+// to parse expressions within them.
+static bool
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth);
+
+/**
+ * These are the states of the options that are configurable on the regular
+ * expression (or from within a group).
+ */
+typedef enum {
+ PM_REGEXP_OPTION_STATE_INVALID,
+ PM_REGEXP_OPTION_STATE_TOGGLEABLE,
+ PM_REGEXP_OPTION_STATE_ADDABLE,
+ PM_REGEXP_OPTION_STATE_ADDED,
+ PM_REGEXP_OPTION_STATE_REMOVED
+} pm_regexp_option_state_t;
+
+// These are the options that are configurable on the regular expression (or
+// from within a group).
+
+/** The minimum character value for a regexp option slot. */
+#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
+
+/** The maximum character value for a regexp option slot. */
+#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
+
+/** The number of regexp option slots. */
+#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
+
+/**
+ * This is the set of options that are configurable on the regular expression.
+ */
+typedef struct {
+ /** The current state of each option. */
+ uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
+} pm_regexp_options_t;
+
+/**
+ * Initialize a new set of options to their default values.
+ */
+static void
+pm_regexp_options_init(pm_regexp_options_t *options) {
+ memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
+ options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
+ options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
+ options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
+ options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
+ options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
+ options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
+}
+
+/**
+ * Attempt to add the given option to the set of options. Returns true if it was
+ * added, false if it was already present.
+ */
+static bool
+pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
+
+ switch (options->values[key]) {
+ case PM_REGEXP_OPTION_STATE_INVALID:
+ case PM_REGEXP_OPTION_STATE_REMOVED:
+ return false;
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
+ options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
+ return true;
+ case PM_REGEXP_OPTION_STATE_ADDED:
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Attempt to remove the given option from the set of options. Returns true if
+ * it was removed, false if it was already absent.
+ */
+static bool
+pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
+
+ switch (options->values[key]) {
+ case PM_REGEXP_OPTION_STATE_INVALID:
+ case PM_REGEXP_OPTION_STATE_ADDABLE:
+ return false;
+ case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
+ case PM_REGEXP_OPTION_STATE_ADDED:
+ case PM_REGEXP_OPTION_STATE_REMOVED:
+ options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * True if the given key is set in the options.
+ */
+static uint8_t
+pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) {
+ if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
+ key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
+ return options->values[key];
+ }
+
+ return false;
+}
+
+/**
+ * Groups can have quite a few different patterns for syntax. They basically
+ * just wrap a set of expressions, but they can potentially have options after a
+ * question mark. If there _isn't_ a question mark, then it's just a set of
+ * expressions. If there _is_, then here are the options:
+ *
+ * * (?#...) - inline comments
+ * * (?:subexp) - non-capturing group
+ * * (?=subexp) - positive lookahead
+ * * (?!subexp) - negative lookahead
+ * * (?>subexp) - atomic group
+ * * (?~subexp) - absence operator
+ * * (?<=subexp) - positive lookbehind
+ * * (?<!subexp) - negative lookbehind
+ * * (?<name>subexp) - named capturing group
+ * * (?'name'subexp) - named capturing group
+ * * (?(cond)yes-subexp) - conditional expression
+ * * (?(cond)yes-subexp|no-subexp) - conditional expression
+ * * (?imxdau-imx) - turn on and off configuration
+ * * (?imxdau-imx:subexp) - turn on and off configuration for an expression
+ */
+static bool
+pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
+ const uint8_t *group_start = parser->cursor;
+
+ pm_regexp_options_t options;
+ pm_regexp_options_init(&options);
+
+ // First, parse any options for the group.
+ if (pm_regexp_char_accept(parser, '?')) {
+ if (pm_regexp_char_is_eof(parser)) {
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
+ return false;
+ }
+
+ switch (*parser->cursor) {
+ case '#': { // inline comments
+ parser->cursor++;
+ if (pm_regexp_char_is_eof(parser)) {
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
+ return false;
+ }
+
+ if (parser->encoding_changed && parser->encoding->multibyte) {
+ bool escaped = false;
+
+ // Here we're going to take a slow path and iterate through
+ // each multibyte character to find the close paren. We do
+ // this because \ can be a trailing byte in some encodings.
+ while (parser->cursor < parser->end) {
+ if (!escaped && *parser->cursor == ')') {
+ parser->cursor++;
+ return true;
+ }
+
+ size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+ if (width == 0) {
+ if (*parser->cursor >= 0x80) {
+ parser->has_invalid_multibyte = true;
+ pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+ parser->cursor++;
+ continue;
+ }
+ return false;
+ }
+
+ escaped = (width == 1) && (*parser->cursor == '\\');
+ parser->cursor += width;
+ }
+
+ return false;
+ } else {
+ // Here we can take the fast path and use memchr to find the
+ // next ) because we are safe checking backward for \ since
+ // it cannot be a trailing character.
+ bool found = pm_regexp_char_find(parser, ')');
+
+ while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
+ found = pm_regexp_char_find(parser, ')');
+ }
+
+ return found;
+ }
+ }
+ case ':': // non-capturing group
+ case '=': // positive lookahead
+ case '!': // negative lookahead
+ case '>': // atomic group
+ case '~': // absence operator
+ parser->cursor++;
+ break;
+ case '<':
+ parser->cursor++;
+ if (pm_regexp_char_is_eof(parser)) {
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
+ return false;
+ }
+
+ switch (*parser->cursor) {
+ case '=': // positive lookbehind
+ case '!': // negative lookbehind
+ parser->cursor++;
+ break;
+ default: { // named capture group
+ const uint8_t *start = parser->cursor;
+ if (!pm_regexp_char_find(parser, '>')) {
+ return false;
+ }
+
+ if (parser->cursor - start == 1) {
+ pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
+ }
+
+ if (parser->name_callback != NULL) {
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ }
+
+ break;
+ }
+ }
+ break;
+ case '\'': { // named capture group
+ const uint8_t *start = ++parser->cursor;
+ if (!pm_regexp_char_find(parser, '\'')) {
+ return false;
+ }
+
+ if (parser->name_callback != NULL) {
+ pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
+ }
+
+ break;
+ }
+ case '(': // conditional expression
+ if (!pm_regexp_char_find(parser, ')')) {
+ return false;
+ }
+ break;
+ case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
+ if (!pm_regexp_options_add(&options, *parser->cursor)) {
+ return false;
+ }
+ parser->cursor++;
+ }
+
+ if (pm_regexp_char_is_eof(parser)) {
+ return false;
+ }
+
+ // If we are at the end of the group of options and there is no
+ // subexpression, then we are going to be setting the options
+ // for the parent group. In this case we are safe to return now.
+ if (*parser->cursor == ')') {
+ if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
+ parser->extended_mode = true;
+ }
+
+ parser->cursor++;
+ return true;
+ }
+
+ // If we hit a -, then we're done parsing options.
+ if (*parser->cursor != '-') break;
+
+ PRISM_FALLTHROUGH
+ case '-':
+ parser->cursor++;
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
+ if (!pm_regexp_options_remove(&options, *parser->cursor)) {
+ return false;
+ }
+ parser->cursor++;
+ }
+
+ if (pm_regexp_char_is_eof(parser)) {
+ return false;
+ }
+
+ // If we are at the end of the group of options and there is no
+ // subexpression, then we are going to be setting the options
+ // for the parent group. In this case we are safe to return now.
+ if (*parser->cursor == ')') {
+ switch (pm_regexp_options_state(&options, 'x')) {
+ case PM_REGEXP_OPTION_STATE_ADDED:
+ parser->extended_mode = true;
+ break;
+ case PM_REGEXP_OPTION_STATE_REMOVED:
+ parser->extended_mode = false;
+ break;
+ }
+
+ parser->cursor++;
+ return true;
+ }
+
+ break;
+ default:
+ parser->cursor++;
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
+ break;
+ }
+ }
+
+ bool extended_mode = parser->extended_mode;
+ switch (pm_regexp_options_state(&options, 'x')) {
+ case PM_REGEXP_OPTION_STATE_ADDED:
+ parser->extended_mode = true;
+ break;
+ case PM_REGEXP_OPTION_STATE_REMOVED:
+ parser->extended_mode = false;
+ break;
+ }
+
+ // Now, parse the expressions within this group.
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
+ if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
+ parser->extended_mode = extended_mode;
+ return false;
+ }
+ pm_regexp_char_accept(parser, '|');
+ }
+
+ // Finally, make sure we have a closing parenthesis.
+ parser->extended_mode = extended_mode;
+ if (pm_regexp_char_expect(parser, ')')) return true;
+
+ pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
+ return false;
+}
+
+/**
+ * item : anchor
+ * | match-posix-class
+ * | match-char-set
+ * | match-char-class
+ * | match-char-prop
+ * | match-char
+ * | match-any
+ * | group
+ * | quantified
+ * ;
+ */
+static bool
+pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
+ switch (*parser->cursor) {
+ case '^':
+ case '$':
+ parser->cursor++;
+ return pm_regexp_parse_quantifier(parser);
+ case '\\':
+ parser->cursor++;
+ pm_regexp_parse_backslash_escape(parser);
+ return pm_regexp_parse_quantifier(parser);
+ case '(':
+ parser->cursor++;
+ return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
+ case '[':
+ parser->cursor++;
+ return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
+ case '*':
+ case '?':
+ case '+':
+ parser->cursor++;
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
+ return true;
+ case ')':
+ parser->cursor++;
+ pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
+ return true;
+ case '#':
+ if (parser->extended_mode) {
+ if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
+ return true;
+ }
+ PRISM_FALLTHROUGH
+ default: {
+ size_t width;
+ if (!parser->encoding_changed) {
+ width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+ } else {
+ width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+ }
+
+ if (width == 0) {
+ if (*parser->cursor >= 0x80 && parser->encoding_changed) {
+ if (parser->encoding->multibyte) {
+ // Invalid multibyte character in a multibyte encoding.
+ // Emit the error at the byte location immediately.
+ parser->has_invalid_multibyte = true;
+ pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+ } else {
+ // Non-ASCII byte in a single-byte encoding (e.g.,
+ // US-ASCII). Count it for later error reporting.
+ parser->non_ascii_literal_count++;
+ }
+ parser->cursor++;
+ return pm_regexp_parse_quantifier(parser);
+ }
+ return false;
+ }
+
+ // Count non-ASCII literal bytes.
+ for (size_t i = 0; i < width; i++) {
+ if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++;
+ }
+
+ parser->cursor += width;
+ return pm_regexp_parse_quantifier(parser);
+ }
+ }
+}
+
+/**
+ * expression : item+
+ * ;
+ */
+static bool
+pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
+ if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
+ pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
+ return false;
+ }
+
+ if (!pm_regexp_parse_item(parser, depth)) {
+ return false;
+ }
+
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
+ if (!pm_regexp_parse_item(parser, depth)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/**
+ * pattern : EOF
+ * | expression EOF
+ * | expression '|' pattern
+ * ;
+ */
+static bool
+pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
+ do {
+ if (pm_regexp_char_is_eof(parser)) return true;
+ if (!pm_regexp_parse_expression(parser, 0)) return false;
+ } while (pm_regexp_char_accept(parser, '|'));
+
+ return pm_regexp_char_is_eof(parser);
+}
+
+// ---------------------------------------------------------------------------
+// Encoding validation
+// ---------------------------------------------------------------------------
+
+/**
+ * Validate that groups of hex escape bytes in the buffer form valid multibyte
+ * characters in the given encoding. Groups are separated by 0x00 sentinels.
+ */
+static bool
+pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) {
+ const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer);
+ size_t len = pm_buffer_length(buffer);
+ size_t i = 0;
+
+ while (i < len) {
+ size_t group_start = i;
+ while (i < len && data[i] != 0x00) i++;
+
+ for (size_t j = group_start; j < i; ) {
+ size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j));
+ if (width == 0) return false;
+ j += width;
+ }
+
+ if (i < len) i++; // skip sentinel
+ }
+
+ return true;
+}
+
+/**
+ * Format regexp source content for use in error messages, hex-escaping
+ * non-ASCII bytes.
+ */
+static void
+pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) {
+ size_t index = 0;
+
+ if (encoding == PM_ENCODING_UTF_8_ENTRY) {
+ pm_buffer_append_string(buffer, (const char *) source, length);
+ return;
+ }
+
+ while (index < length) {
+ if (source[index] < 0x80) {
+ pm_buffer_append_byte(buffer, source[index]);
+ index++;
+ } else if (encoding->multibyte) {
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index));
+
+ if (width > 1) {
+ pm_buffer_append_string(buffer, "\\x{", 3);
+ for (size_t i = 0; i < width; i++) {
+ pm_buffer_append_format(buffer, "%02X", source[index + i]);
+ }
+ pm_buffer_append_byte(buffer, '}');
+ index += width;
+ } else {
+ pm_buffer_append_format(buffer, "\\x%02X", source[index]);
+ index++;
+ }
+ } else {
+ pm_buffer_append_format(buffer, "\\x%02X", source[index]);
+ index++;
+ }
+ }
+}
+
+/**
+ * Emit an encoding validation error on the regexp node.
+ */
+#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \
+ pm_diagnostic_list_append_format( \
+ &(parser)->parser->metadata_arena, \
+ &(parser)->parser->error_list, \
+ (uint32_t) ((parser)->node_start - (parser)->parser->start), \
+ (uint32_t) ((parser)->node_end - (parser)->node_start), \
+ diag_id, __VA_ARGS__)
+
+/**
+ * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n).
+ *
+ * The decision tree is:
+ *
+ * 1. No escape-set encoding (explicit_encoding == NULL):
+ * a. ASCII-only content: validate property escapes, return forced US-ASCII
+ * for /n or the modifier flags for others.
+ * b. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ * c. Source encoding differs from modifier encoding: emit mismatch error.
+ *
+ * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending
+ * on the modifier and which escape type was last.
+ *
+ * 3. \u escape with non-/u modifier: incompatible encoding error.
+ *
+ * 4. Validate that hex escape byte sequences form valid multibyte characters
+ * in the modifier's encoding.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) {
+
+ if (parser->explicit_encoding == NULL) {
+ if (ascii_only) {
+ // Check property escapes against the modifier's encoding tier.
+ // /n (ASCII-8BIT): only POSIX properties are valid.
+ // /e, /s: POSIX and script properties are valid.
+ // /u: all properties are valid.
+ if (modifier == 'n' && parser->property_name != NULL) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+ (int) parser->property_name_length, (const char *) parser->property_name,
+ source_length, source_start);
+ } else if (modifier != 'u' && parser->has_unicode_property_escape) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
+ (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name,
+ source_length, source_start);
+ }
+ return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
+ }
+
+ if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+ for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+ }
+ } else if (parser->encoding != modifier_encoding) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);
+
+ if (modifier == 'n' && !ascii_only) {
+ pm_buffer_t formatted = { 0 };
+ pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length);
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value);
+ pm_buffer_cleanup(&formatted);
+ }
+ }
+
+ return flags;
+ }
+
+ // Mixed unicode + hex escapes.
+ if (parser->has_unicode_escape && parser->has_hex_escape) {
+ if (modifier == 'n') {
+ if (parser->last_escape_was_unicode) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+ } else {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+ }
+ } else {
+ if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+ }
+ }
+
+ return flags;
+ }
+
+ if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+ if (parser->last_escape_was_unicode) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+ } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
+ }
+ }
+
+ if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+ }
+
+ return flags;
+}
+
+/**
+ * Validate encoding for a regexp without a modifier and compute the encoding
+ * flags to set on the node.
+ *
+ * The decision tree is:
+ *
+ * 1. If a modifier (/n, /u, /e, /s) is present, delegate to
+ * pm_regexp_validate_encoding_modifier.
+ * 2. Invalid multibyte chars or unicode ranges: suppress further checks (errors
+ * were already emitted during parsing).
+ * 3. US-ASCII source with non-ASCII literals: emit per-byte errors.
+ * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}).
+ * 5. Escape-set encoding present: validate hex escapes against the target
+ * encoding, handle mixed \u + \x conflicts, and return the appropriate
+ * forced encoding flag.
+ */
+static pm_node_flags_t
+pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) {
+
+ // Invalid multibyte characters suppress further validation.
+ // Errors were already emitted at the byte locations during parsing.
+ if (parser->has_invalid_multibyte) {
+ return flags;
+ }
+
+ if (parser->invalid_unicode_range) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
+ return flags;
+ }
+
+ // Check modifier flags first.
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
+ return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
+ }
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
+ return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
+ }
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
+ return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
+ }
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
+ return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
+ }
+
+ // No modifier — check for non-ASCII literals in US-ASCII encoding.
+ if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
+ for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
+ }
+ }
+
+ // ASCII-only regexps get downgraded to US-ASCII, unless property escapes
+ // force UTF-8.
+ if (ascii_only) {
+ if (parser->has_property_escape) {
+ return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+ }
+ return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
+ }
+
+ // Check explicit encoding from escape sequences.
+ if (parser->explicit_encoding != NULL) {
+ // Mixed unicode + hex escapes without modifier.
+ if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
+ if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY &&
+ parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
+ !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+ } else if (parser->last_escape_was_unicode) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
+ } else {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
+ }
+
+ return 0;
+ }
+
+ if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+ if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+ }
+
+ return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
+ } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
+ return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
+ } else {
+ if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
+ PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
+ }
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Parse a regular expression, validate its encoding, and optionally extract
+ * named capture groups. Encoding validation walks the raw source (content_loc)
+ * to distinguish escape-produced bytes from literal bytes. Named capture
+ * extraction walks the unescaped content since escape sequences in group names
+ * (e.g., line continuations) have already been processed by the lexer.
+ */
+pm_node_flags_t
+pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+ const uint8_t *source = parser->start + node->content_loc.start;
+ size_t size = node->content_loc.length;
+ bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED);
+ pm_node_flags_t flags = PM_NODE_FLAGS(node);
+
+ const uint8_t *node_start = parser->start + node->base.location.start;
+ const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length;
+
+ // First pass: walk raw source for encoding validation (no name extraction).
+ pm_regexp_parser_t regexp_parser = {
+ .parser = parser,
+ .start = source,
+ .cursor = source,
+ .end = source + size,
+ .extended_mode = extended_mode,
+ .encoding_changed = parser->encoding_changed,
+ .encoding = parser->encoding,
+ .name_callback = NULL,
+ .name_data = NULL,
+ .shared = true,
+ .node_start = node_start,
+ .node_end = node_end,
+ .has_unicode_escape = false,
+ .has_hex_escape = false,
+ .last_escape_was_unicode = false,
+ .explicit_encoding = NULL,
+ .has_property_escape = false,
+ .has_unicode_property_escape = false,
+ .property_name = NULL,
+ .property_name_length = 0,
+ .unicode_property_name = NULL,
+ .unicode_property_name_length = 0,
+ .non_ascii_literal_count = 0,
+ .invalid_unicode_range = false,
+ .hex_escape_buffer = { 0 },
+ .hex_group_active = false,
+ .has_invalid_multibyte = false,
+ };
+
+ pm_regexp_parse_pattern(&regexp_parser);
+
+ // Compute ascii_only from the regexp parser's tracked state. We cannot
+ // use node->unescaped for this because regexp unescaped content preserves
+ // escape text (e.g., \x80 is 4 ASCII chars), not the binary values.
+ bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0;
+ // Use the unescaped content for error messages to match CRuby's format,
+ // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes
+ // like \u{80} are preserved as text.
+ const char *error_source = (const char *) pm_string_source(&node->unescaped);
+ int error_source_length = (int) pm_string_length(&node->unescaped);
+ pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(&regexp_parser, ascii_only, flags, error_source, error_source_length);
+ pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
+
+ // Second pass: walk unescaped content for named capture extraction.
+ if (name_callback != NULL) {
+ bool shared = node->unescaped.type == PM_STRING_SHARED;
+ pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data);
+ }
+
+ return encoding_flags;
+}
+
+/**
+ * Parse an interpolated regular expression for named capture groups only.
+ * This is used for the =~ operator with interpolated regexps where we don't
+ * have a pm_regular_expression_node_t. No encoding validation is performed.
+ *
+ * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.)
+ * are initialized but not used for the result. They exist because the parsing
+ * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update
+ * them as they walk through the content.
+ */
+void
+pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) {
+ pm_regexp_parser_t regexp_parser = {
+ .parser = parser,
+ .start = source,
+ .cursor = source,
+ .end = source + size,
+ .extended_mode = extended_mode,
+ .encoding_changed = parser->encoding_changed,
+ .encoding = parser->encoding,
+ .name_callback = name_callback,
+ .name_data = name_data,
+ .shared = shared,
+ .node_start = source,
+ .node_end = source + size,
+ .has_unicode_escape = false,
+ .has_hex_escape = false,
+ .last_escape_was_unicode = false,
+ .explicit_encoding = NULL,
+ .has_property_escape = false,
+ .has_unicode_property_escape = false,
+ .property_name = NULL,
+ .property_name_length = 0,
+ .unicode_property_name = NULL,
+ .unicode_property_name_length = 0,
+ .non_ascii_literal_count = 0,
+ .invalid_unicode_range = false,
+ .hex_escape_buffer = { 0 },
+ .hex_group_active = false,
+ .has_invalid_multibyte = false,
+ };
+
+ pm_regexp_parse_pattern(&regexp_parser);
+ pm_buffer_cleanup(&regexp_parser.hex_escape_buffer);
+}