diff options
Diffstat (limited to 'prism/regexp.c')
| -rw-r--r-- | prism/regexp.c | 1717 |
1 files changed, 1717 insertions, 0 deletions
diff --git a/prism/regexp.c b/prism/regexp.c new file mode 100644 index 0000000000..cc17aa4d09 --- /dev/null +++ b/prism/regexp.c @@ -0,0 +1,1717 @@ +#include "prism/internal/regexp.h" + +#include "prism/compiler/inline.h" +#include "prism/compiler/fallthrough.h" +#include "prism/internal/buffer.h" +#include "prism/internal/char.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/memchr.h" +#include "prism/internal/parser.h" +#include "prism/internal/stringy.h" +#include "prism/internal/strncasecmp.h" + +#include <assert.h> +#include <string.h> + +/** The maximum depth of nested groups allowed in a regular expression. */ +#define PM_REGEXP_PARSE_DEPTH_MAX 4096 + +/** + * This is the parser that is going to handle parsing regular expressions. + */ +typedef struct { + /** The parser that is currently being used. */ + pm_parser_t *parser; + + /** A pointer to the start of the source that we are parsing. */ + const uint8_t *start; + + /** A pointer to the current position in the source. */ + const uint8_t *cursor; + + /** A pointer to the end of the source that we are parsing. */ + const uint8_t *end; + + /** The encoding of the source. */ + const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + pm_regexp_name_data_t *name_data; + + /** The start of the regexp node (for error locations). */ + const uint8_t *node_start; + + /** The end of the regexp node (for error locations). */ + const uint8_t *node_end; + + /** + * The explicit encoding determined by escape sequences. NULL if no + * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the + * source encoding for `\x` escapes. + */ + const pm_encoding_t *explicit_encoding; + + /** + * Pointer to the first non-POSIX property name (for /n error messages). + * POSIX properties (Alnum, Alpha, etc.) 
work in all encodings. + * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u. + * Unicode-only properties (L, Ll, etc.) work only in /u. + */ + const uint8_t *property_name; + + /** Length of the first non-POSIX property name found. */ + size_t property_name_length; + + /** + * Pointer to the first Unicode-only property name (for /e, /s error + * messages). NULL if only POSIX or script properties have been seen. + */ + const uint8_t *unicode_property_name; + + /** Length of the first Unicode-only property name found. */ + size_t unicode_property_name_length; + + /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */ + pm_buffer_t hex_escape_buffer; + + /** Count of non-ASCII literal bytes (not from escapes). */ + uint32_t non_ascii_literal_count; + + /** + * Whether or not the regular expression currently being parsed is in + * extended mode, wherein whitespace is ignored and comments are allowed. + */ + bool extended_mode; + + /** Whether the encoding has changed from the default. */ + bool encoding_changed; + + /** Whether the source content is shared (for named capture callback). */ + bool shared; + + /** Whether a `\u{...}` escape with value >= 0x80 was seen. */ + bool has_unicode_escape; + + /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */ + bool has_hex_escape; + + /** + * Tracks whether the last encoding-setting escape was `\u` (true) or `\x` + * (false). This matters for error messages when both types are mixed. + */ + bool last_escape_was_unicode; + + /** Whether any `\p{...}` or `\P{...}` property escape was found. */ + bool has_property_escape; + + /** Whether a Unicode-only property escape was found (not POSIX or script). */ + bool has_unicode_property_escape; + + /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */ + bool invalid_unicode_range; + + /** Whether we are accumulating consecutive hex escape bytes. 
*/ + bool hex_group_active; + + /** Whether an invalid multibyte character was found during parsing. */ + bool has_invalid_multibyte; +} pm_regexp_parser_t; + +/** + * Append a syntax error to the parser's error list. If the source is shared + * (points into the original source), we can point to the exact error location. + * Otherwise, we point to the whole regexp node. + */ +static PRISM_INLINE void +pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { + pm_parser_t *pm = parser->parser; + uint32_t loc_start, loc_length; + + if (parser->shared) { + loc_start = (uint32_t) (start - pm->start); + loc_length = (uint32_t) (end - start); + } else { + loc_start = (uint32_t) (parser->node_start - pm->start); + loc_length = (uint32_t) (parser->node_end - parser->node_start); + } + + pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message); +} + +/** + * Append a formatted diagnostic error with proper shared/non-shared location + * handling. This is a macro because we need variadic args for the format string. + */ +#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \ + do { \ + pm_parser_t *pm__ = (parser_)->parser; \ + uint32_t loc_start__, loc_length__; \ + if ((parser_)->shared) { \ + loc_start__ = (uint32_t) ((err_start_) - pm__->start); \ + loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \ + } else { \ + loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \ + loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \ + } \ + pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \ + } while (0) + +/** + * This appends a new string to the list of named captures. This function + * assumes the caller has already checked the validity of the name callback. 
+ */ +static void +pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_string_t string; + pm_string_shared_init(&string, start, end); + parser->name_callback(parser->parser, &string, parser->shared, parser->name_data); + pm_string_cleanup(&string); +} + +/** + * Returns true if the next character is the end of the source. + */ +static PRISM_INLINE bool +pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { + return parser->cursor >= parser->end; +} + +/** + * Optionally accept a char and consume it if it exists. + */ +static PRISM_INLINE bool +pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { + if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { + parser->cursor++; + return true; + } + return false; +} + +/** + * Expect a character to be present and consume it. + */ +static PRISM_INLINE bool +pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { + if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { + parser->cursor++; + return true; + } + return false; +} + +/** + * This advances the current token to the next instance of the given character. + */ +static bool +pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { + if (pm_regexp_char_is_eof(parser)) { + return false; + } + + const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding); + if (end == NULL) { + return false; + } + + parser->cursor = end + 1; + return true; +} + +/** + * Mark a group boundary in the hex escape byte buffer. When consecutive hex + * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00 + * sentinel to separate the groups for later multibyte validation. 
+ */ +static PRISM_INLINE void +pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) { + if (parser->hex_group_active) { + pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00); + parser->hex_group_active = false; + } +} + +/** + * Track a hex escape byte value >= 0x80 for multibyte validation. + */ +static PRISM_INLINE void +pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) { + if (byte >= 0x80) { + pm_buffer_append_byte(&parser->hex_escape_buffer, byte); + parser->hex_group_active = true; + parser->has_hex_escape = true; + + parser->explicit_encoding = parser->encoding; + parser->last_escape_was_unicode = false; + } else { + pm_regexp_hex_group_boundary(parser); + } +} + +/** + * Parse a hex digit character and return its value, or -1 if not a hex digit. + */ +static PRISM_INLINE int +pm_regexp_hex_digit_value(uint8_t byte) { + if (byte >= '0' && byte <= '9') return byte - '0'; + if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10; + if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10; + return -1; +} + +/** + * Range quantifiers are a special class of quantifiers that look like + * + * * {digit} + * * {digit,} + * * {digit,digit} + * * {,digit} + * + * If there are any spaces in between, then this just becomes a regular + * character match expression and we have to backtrack. So when this function + * first starts running, we'll create a "save" point and then attempt to parse + * the quantifier. If it fails, we'll restore the save point and return. + * + * To properly track everything, we're going to build a little state machine. 
+ * It looks something like the following: + * + * +-------+ +---------+ ------------+ + * ---- lbrace ---> | start | ---- digit ---> | minimum | | + * +-------+ +---------+ <--- digit -+ + * | | | + * +-------+ | | rbrace + * | comma | <----- comma +---- comma -------+ | + * +-------+ V V + * | +---------+ +---------+ + * +-- digit --> | maximum | -- rbrace --> || final || + * +---------+ +---------+ + * | ^ + * +- digit -+ + * + * Note that by the time we've hit this function, the lbrace has already been + * consumed so we're in the start state. + */ +static bool +pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) { + const uint8_t *savepoint = parser->cursor; + + enum { + PM_REGEXP_RANGE_QUANTIFIER_STATE_START, + PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM, + PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM, + PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA + } state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START; + + while (1) { + if (parser->cursor >= parser->end) { + parser->cursor = savepoint; + return true; + } + + switch (state) { + case PM_REGEXP_RANGE_QUANTIFIER_STATE_START: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM; + break; + case ',': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA; + break; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + break; + case ',': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM; + break; + case '}': + parser->cursor++; + return true; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case 
'4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM; + break; + default: + parser->cursor = savepoint; + return true; + } + break; + case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM: + switch (*parser->cursor) { + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': + parser->cursor++; + break; + case '}': + parser->cursor++; + return true; + default: + parser->cursor = savepoint; + return true; + } + break; + } + } + + return true; +} + +/** + * quantifier : star-quantifier + * | plus-quantifier + * | optional-quantifier + * | range-quantifier + * | <empty> + * ; + */ +static bool +pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) { + while (!pm_regexp_char_is_eof(parser)) { + switch (*parser->cursor) { + case '*': + case '+': + case '?': + parser->cursor++; + break; + case '{': + parser->cursor++; + if (!pm_regexp_parse_range_quantifier(parser)) return false; + break; + default: + // In this case there is no quantifier. + return true; + } + } + + return true; +} + +/** + * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']' + * ; + */ +static bool +pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { + if (!pm_regexp_char_expect(parser, ':')) { + return false; + } + + pm_regexp_char_accept(parser, '^'); + + return ( + pm_regexp_char_find(parser, ':') && + pm_regexp_char_expect(parser, ']') && + pm_regexp_char_expect(parser, ']') + ); +} + +/** + * Property escape classification. Onigmo supports three tiers of property + * names depending on the encoding: + * + * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower, + * Print, Punct, Space, Upper, XDigit, Word): valid in all encodings. + * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid + * in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n). 
+ * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus + * Any, Assigned): valid only in UTF-8 (/u). + */ +typedef enum { + PM_REGEXP_PROPERTY_POSIX, + PM_REGEXP_PROPERTY_SCRIPT, + PM_REGEXP_PROPERTY_UNICODE +} pm_regexp_property_type_t; + +/** + * Classify a property name. The name may start with '^' for negation, which + * is skipped before matching. + */ +static pm_regexp_property_type_t +pm_regexp_classify_property(const uint8_t *name, size_t length) { + // Skip leading '^' for negated properties like \p{^Hiragana}. + if (length > 0 && name[0] == '^') { + name++; + length--; + } + +#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0) + + switch (length) { + case 3: + if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 4: + if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 5: + /* Most properties are length 5, so dispatch on first character. */ + switch (name[0] | 0x20) { + case 'a': + if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'b': + if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'c': + if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'd': + if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'g': + if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'l': + if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'p': + if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 's': + if 
(PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'u': + if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX; + break; + } + break; + case 6: + if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 8: + if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + } + +#undef PM_REGEXP_CASECMP + + // Everything else is Unicode-only (general categories, other scripts, etc.). + return PM_REGEXP_PROPERTY_UNICODE; +} + +/** + * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The + * cursor should be pointing at 'p' or 'P' when this is called. If a property + * escape is found, record it on the regexp parser and advance past the closing + * '}'. + * + * Properties are classified into three tiers (POSIX, script, Unicode-only) to + * determine which encoding modifiers they are valid with. + */ +static bool +pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) { + assert(*parser->cursor == 'p' || *parser->cursor == 'P'); + + if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') { + const uint8_t *name_start = parser->cursor + 2; + const uint8_t *search = name_start; + + while (search < parser->end && *search != '}') search++; + + if (search < parser->end) { + size_t name_length = (size_t) (search - name_start); + parser->has_property_escape = true; + + pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length); + + // Track the first non-POSIX property name (for /n error messages). + if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) { + parser->property_name = name_start; + parser->property_name_length = name_length; + } + + // Track the first Unicode-only property name (for /e, /s error messages). 
+ if (type == PM_REGEXP_PROPERTY_UNICODE) { + parser->has_unicode_property_escape = true; + if (parser->unicode_property_name == NULL) { + parser->unicode_property_name = name_start; + parser->unicode_property_name_length = name_length; + } + } + + parser->cursor = search + 1; // skip past '}' + return true; + } + } + + // Not a property escape, just skip the single character after '\'. + parser->cursor++; + return false; +} + +/** + * Validate and skip a \u escape sequence in a regular expression. The cursor + * should be pointing at the character after 'u' when this is called. This + * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding + * state for validation. + */ +static void +pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) { + const uint8_t *escape_start = parser->cursor - 2; // points to '\' + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + return; + } + + if (*parser->cursor == '{') { + parser->cursor++; // skip '{' + + // Skip leading whitespace. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + + bool has_codepoint = false; + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + // Parse the hex digits to compute the codepoint value. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count == 0) { + // Skip to '}' or end of regexp to find the full extent. 
+ while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + parser->cursor++; + } + + const uint8_t *escape_end = parser->cursor; + if (!pm_regexp_char_is_eof(parser)) { + escape_end++; + parser->cursor++; // skip '}' + } + + pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start); + return; + } + + if (hex_count > 6) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range"); + } + + // Track encoding state for this codepoint. + if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range (surrogates or > 0x10FFFF). + if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) { + parser->invalid_unicode_range = true; + } + + has_codepoint = true; + + // Skip whitespace between codepoints. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + } + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape"); + } else { + if (!has_codepoint) { + pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start); + } + parser->cursor++; // skip '}' + } + } else { + // \uNNNN form — need exactly 4 hex digits. 
+ uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count < 4) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + } else if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range. + if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) { + parser->invalid_unicode_range = true; + } + } +} + +// Forward declaration because character sets can be nested. +static bool +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); + +/** + * Parse a \x escape and return the byte value. The cursor should be pointing + * at the character after 'x'. Returns -1 if no hex digits follow. + */ +static int +pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) { + int value = -1; + + if (!pm_regexp_char_is_eof(parser)) { + int digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = digit; + parser->cursor++; + + if (!pm_regexp_char_is_eof(parser)) { + digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = (value << 4) | digit; + parser->cursor++; + } + } + } + } + + if (value >= 0) { + pm_regexp_track_hex_escape(parser, (uint8_t) value); + } + + return value; +} + +/** + * Parse a backslash escape sequence in a regexp, handling \u (unicode), + * \p/\P (property), \x (hex), and other single-character escapes. Also + * tracks encoding state for \M-x and \C-\M-x escapes. 
+ */ +static void +pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) { + if (pm_regexp_char_is_eof(parser)) return; + + switch (*parser->cursor) { + case 'u': + parser->cursor++; // skip 'u' + pm_regexp_parse_unicode_escape(parser); + break; + case 'p': + case 'P': + pm_regexp_parse_property_escape(parser); + break; + case 'x': + parser->cursor++; // skip 'x' + pm_regexp_parse_hex_escape(parser); + break; + case 'M': + // \M-x produces (x | 0x80), always >= 0x80 + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'M-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80 + // We just need to track it as a hex escape >= 0x80. + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + // \M-x always produces a byte >= 0x80 + pm_regexp_track_hex_escape(parser, 0x80); + } + } else { + parser->cursor++; + } + break; + case 'C': + // \C-x produces (x & 0x1F) + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'C-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + } else { + parser->cursor++; + } + break; + case 'c': + // \cx produces (x & 0x1F) + parser->cursor++; // skip 'c' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + break; + default: + pm_regexp_hex_group_boundary(parser); + parser->cursor++; + break; + } +} + +/** + * Check if a byte at the current position is a non-ASCII byte in a multibyte + * encoding that produces an invalid character. If so, emit an error at the + * byte location immediately. 
+ */ +static void +pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) { + uint8_t byte = *cursor; + if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) { + size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor)); + if (width > 1) { + parser->cursor += width - 1; + } else if (width == 0) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } +} + +/** + * match-char-set : '[' '^'? (match-range | match-char)* ']' + * ; + */ +static bool +pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { + pm_regexp_char_accept(parser, '^'); + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') { + switch (*parser->cursor++) { + case '[': + pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1)); + break; + case '\\': + pm_regexp_parse_backslash_escape(parser); + break; + default: + // We've already advanced the cursor by one byte. If the byte + // was >= 0x80 in a multibyte encoding, we may need to consume + // additional continuation bytes and validate the character. + if (*(parser->cursor - 1) >= 0x80) { + parser->non_ascii_literal_count++; + } + pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1); + break; + } + } + + return pm_regexp_char_expect(parser, ']'); +} + +/** + * A left bracket can either mean a POSIX class or a character set. 
+ */ +static bool +pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth) { + if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) { + pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over"); + return false; + } + + if ((parser->cursor < parser->end) && parser->cursor[0] == ']') { + parser->cursor++; + pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "empty char-class"); + return true; + } + + const uint8_t *reset = parser->cursor; + + if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') { + parser->cursor++; + if (pm_regexp_parse_posix_class(parser)) return true; + + parser->cursor = reset; + } + + return pm_regexp_parse_character_set(parser, depth); +} + +// Forward declaration here since parsing groups needs to go back up the grammar +// to parse expressions within them. +static bool +pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth); + +/** + * These are the states of the options that are configurable on the regular + * expression (or from within a group). + */ +typedef enum { + PM_REGEXP_OPTION_STATE_INVALID, + PM_REGEXP_OPTION_STATE_TOGGLEABLE, + PM_REGEXP_OPTION_STATE_ADDABLE, + PM_REGEXP_OPTION_STATE_ADDED, + PM_REGEXP_OPTION_STATE_REMOVED +} pm_regexp_option_state_t; + +// These are the options that are configurable on the regular expression (or +// from within a group). + +/** The minimum character value for a regexp option slot. */ +#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' + +/** The maximum character value for a regexp option slot. */ +#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' + +/** The number of regexp option slots. */ +#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) + +/** + * This is the set of options that are configurable on the regular expression. + */ +typedef struct { + /** The current state of each option. 
*/ + uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS]; +} pm_regexp_options_t; + +/** + * Initialize a new set of options to their default values. + */ +static void +pm_regexp_options_init(pm_regexp_options_t *options) { + memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS); + options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE; + options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; + options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; + options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE; +} + +/** + * Attempt to add the given option to the set of options. Returns true if it was + * added, false if it was already present. + */ +static bool +pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + + switch (options->values[key]) { + case PM_REGEXP_OPTION_STATE_INVALID: + case PM_REGEXP_OPTION_STATE_REMOVED: + return false; + case PM_REGEXP_OPTION_STATE_TOGGLEABLE: + case PM_REGEXP_OPTION_STATE_ADDABLE: + options->values[key] = PM_REGEXP_OPTION_STATE_ADDED; + return true; + case PM_REGEXP_OPTION_STATE_ADDED: + return true; + } + } + + return false; +} + +/** + * Attempt to remove the given option from the set of options. Returns true if + * it was removed, false if it was already absent. 
+ */ +static bool +pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + + switch (options->values[key]) { + case PM_REGEXP_OPTION_STATE_INVALID: + case PM_REGEXP_OPTION_STATE_ADDABLE: + return false; + case PM_REGEXP_OPTION_STATE_TOGGLEABLE: + case PM_REGEXP_OPTION_STATE_ADDED: + case PM_REGEXP_OPTION_STATE_REMOVED: + options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED; + return true; + } + } + + return false; +} + +/** + * True if the given key is set in the options. + */ +static uint8_t +pm_regexp_options_state(pm_regexp_options_t *options, uint8_t key) { + if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) { + key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM); + return options->values[key]; + } + + return false; +} + +/** + * Groups can have quite a few different patterns for syntax. They basically + * just wrap a set of expressions, but they can potentially have options after a + * question mark. If there _isn't_ a question mark, then it's just a set of + * expressions. If there _is_, then here are the options: + * + * * (?#...) 
- inline comments
 * * (?:subexp)                    - non-capturing group
 * * (?=subexp)                    - positive lookahead
 * * (?!subexp)                    - negative lookahead
 * * (?>subexp)                    - atomic group
 * * (?~subexp)                    - absence operator
 * * (?<=subexp)                   - positive lookbehind
 * * (?<!subexp)                   - negative lookbehind
 * * (?<name>subexp)               - named capturing group
 * * (?'name'subexp)               - named capturing group
 * * (?(cond)yes-subexp)           - conditional expression
 * * (?(cond)yes-subexp|no-subexp) - conditional expression
 * * (?imxdau-imx)                 - turn on and off configuration
 * * (?imxdau-imx:subexp)          - turn on and off configuration for an expression
 *
 * The cursor is expected to sit just past the opening parenthesis (the caller
 * in pm_regexp_parse_item consumes it). Returns true when the group was
 * consumed through its closing parenthesis, false otherwise. Note that not
 * every false return records a diagnostic.
 */
static bool
pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) {
    // Remembered so that errors can point back at the start of the group.
    const uint8_t *group_start = parser->cursor;

    pm_regexp_options_t options;
    pm_regexp_options_init(&options);

    // First, parse any options for the group.
    if (pm_regexp_char_accept(parser, '?')) {
        if (pm_regexp_char_is_eof(parser)) {
            pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
            return false;
        }

        switch (*parser->cursor) {
            case '#': { // inline comments
                parser->cursor++;
                if (pm_regexp_char_is_eof(parser)) {
                    pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern in group");
                    return false;
                }

                if (parser->encoding_changed && parser->encoding->multibyte) {
                    bool escaped = false;

                    // Here we're going to take a slow path and iterate through
                    // each multibyte character to find the close paren. We do
                    // this because \ can be a trailing byte in some encodings.
                    while (parser->cursor < parser->end) {
                        if (!escaped && *parser->cursor == ')') {
                            parser->cursor++;
                            return true;
                        }

                        size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
                        if (width == 0) {
                            // A zero width on a high byte is reported as an
                            // invalid multibyte character and skipped; on any
                            // other byte we give up on the comment.
                            if (*parser->cursor >= 0x80) {
                                parser->has_invalid_multibyte = true;
                                pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
                                parser->cursor++;
                                continue;
                            }
                            return false;
                        }

                        // Only a single-byte backslash escapes the next
                        // character.
                        escaped = (width == 1) && (*parser->cursor == '\\');
                        parser->cursor += width;
                    }

                    return false;
                } else {
                    // Here we can take the fast path and use memchr to find the
                    // next ) because we are safe checking backward for \ since
                    // it cannot be a trailing character.
                    // NOTE(review): the cursor - 2 check assumes
                    // pm_regexp_char_find leaves the cursor just past the
                    // match; confirm against its definition.
                    bool found = pm_regexp_char_find(parser, ')');

                    while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
                        found = pm_regexp_char_find(parser, ')');
                    }

                    return found;
                }
            }
            case ':': // non-capturing group
            case '=': // positive lookahead
            case '!': // negative lookahead
            case '>': // atomic group
            case '~': // absence operator
                parser->cursor++;
                break;
            case '<':
                parser->cursor++;
                if (pm_regexp_char_is_eof(parser)) {
                    pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
                    return false;
                }

                switch (*parser->cursor) {
                    case '=': // positive lookbehind
                    case '!': // negative lookbehind
                        parser->cursor++;
                        break;
                    default: { // named capture group
                        const uint8_t *start = parser->cursor;
                        if (!pm_regexp_char_find(parser, '>')) {
                            return false;
                        }

                        // An empty name is reported but parsing continues.
                        if (parser->cursor - start == 1) {
                            pm_regexp_parse_error(parser, start, parser->cursor, "group name is empty");
                        }

                        if (parser->name_callback != NULL) {
                            pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
                        }

                        break;
                    }
                }
                break;
            case '\'': { // named capture group
                // The name starts just past the opening quote.
                const uint8_t *start = ++parser->cursor;
                if (!pm_regexp_char_find(parser, '\'')) {
                    return false;
                }

                if (parser->name_callback != NULL) {
                    pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
                }

                break;
            }
            case '(': // conditional expression
                if (!pm_regexp_char_find(parser, ')')) {
                    return false;
                }
                break;
            case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
                while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
                    if (!pm_regexp_options_add(&options, *parser->cursor)) {
                        return false;
                    }
                    parser->cursor++;
                }

                if (pm_regexp_char_is_eof(parser)) {
                    return false;
                }

                // If we are at the end of the group of options and there is no
                // subexpression, then we are going to be setting the options
                // for the parent group. In this case we are safe to return now.
                if (*parser->cursor == ')') {
                    if (pm_regexp_options_state(&options, 'x') == PM_REGEXP_OPTION_STATE_ADDED) {
                        parser->extended_mode = true;
                    }

                    parser->cursor++;
                    return true;
                }

                // If we did not stop on a -, then we stopped on a : and there
                // are no options to remove. Otherwise fall through to parse the
                // options being turned off.
                if (*parser->cursor != '-') break;

                PRISM_FALLTHROUGH
            case '-':
                parser->cursor++;
                while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
                    if (!pm_regexp_options_remove(&options, *parser->cursor)) {
                        return false;
                    }
                    parser->cursor++;
                }

                if (pm_regexp_char_is_eof(parser)) {
                    return false;
                }

                // If we are at the end of the group of options and there is no
                // subexpression, then we are going to be setting the options
                // for the parent group. In this case we are safe to return now.
                if (*parser->cursor == ')') {
                    switch (pm_regexp_options_state(&options, 'x')) {
                        case PM_REGEXP_OPTION_STATE_ADDED:
                            parser->extended_mode = true;
                            break;
                        case PM_REGEXP_OPTION_STATE_REMOVED:
                            parser->extended_mode = false;
                            break;
                    }

                    parser->cursor++;
                    return true;
                }

                break;
            default:
                // Unknown option characters are reported and skipped, and the
                // rest of the group is still parsed.
                parser->cursor++;
                pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "undefined group option");
                break;
        }
    }

    // Options attached to a subexpression only apply inside this group, so
    // remember the enclosing extended mode and restore it on every exit path
    // below.
    bool extended_mode = parser->extended_mode;
    switch (pm_regexp_options_state(&options, 'x')) {
        case PM_REGEXP_OPTION_STATE_ADDED:
            parser->extended_mode = true;
            break;
        case PM_REGEXP_OPTION_STATE_REMOVED:
            parser->extended_mode = false;
            break;
    }

    // Now, parse the expressions within this group.
    while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
        if (!pm_regexp_parse_expression(parser, (uint16_t) (depth + 1))) {
            parser->extended_mode = extended_mode;
            return false;
        }
        pm_regexp_char_accept(parser, '|');
    }

    // Finally, make sure we have a closing parenthesis.
    parser->extended_mode = extended_mode;
    if (pm_regexp_char_expect(parser, ')')) return true;

    pm_regexp_parse_error(parser, group_start, parser->cursor, "end pattern with unmatched parenthesis");
    return false;
}

/**
 * item : anchor
 *      | match-posix-class
 *      | match-char-set
 *      | match-char-class
 *      | match-char-prop
 *      | match-char
 *      | match-any
 *      | group
 *      | quantified
 *      ;
 *
 * This function reads the byte at the cursor without checking for the end of
 * the input, so callers must only invoke it when the cursor is not at EOF.
 */
static bool
pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) {
    switch (*parser->cursor) {
        case '^':
        case '$':
            parser->cursor++;
            return pm_regexp_parse_quantifier(parser);
        case '\\':
            parser->cursor++;
            pm_regexp_parse_backslash_escape(parser);
            return pm_regexp_parse_quantifier(parser);
        case '(':
            parser->cursor++;
            return pm_regexp_parse_group(parser, depth) && pm_regexp_parse_quantifier(parser);
        case '[':
            parser->cursor++;
            return pm_regexp_parse_lbracket(parser, depth) && pm_regexp_parse_quantifier(parser);
        case '*':
        case '?':
        case '+':
            // A quantifier with nothing to repeat is reported, but parsing
            // continues with the next item.
            parser->cursor++;
            pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "target of repeat operator is not specified");
            return true;
        case ')':
            parser->cursor++;
            pm_regexp_parse_error(parser, parser->cursor - 1, parser->cursor, "unmatched close parenthesis");
            return true;
        case '#':
            // In extended mode a # starts a comment that runs through the next
            // newline, or to the end of the input if there is none.
            if (parser->extended_mode) {
                if (!pm_regexp_char_find(parser, '\n')) parser->cursor = parser->end;
                return true;
            }
            PRISM_FALLTHROUGH
        default: {
            // Without an encoding change the content is measured as UTF-8,
            // otherwise the parser's encoding is used.
            size_t width;
            if (!parser->encoding_changed) {
                width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
            } else {
                width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
            }

            if (width == 0) {
                if (*parser->cursor >= 0x80 && parser->encoding_changed) {
                    if (parser->encoding->multibyte) {
                        // Invalid multibyte character in a multibyte encoding.
                        // Emit the error at the byte location immediately.
                        parser->has_invalid_multibyte = true;
                        pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
                    } else {
                        // Non-ASCII byte in a single-byte encoding (e.g.,
                        // US-ASCII). Count it for later error reporting.
                        parser->non_ascii_literal_count++;
                    }
                    parser->cursor++;
                    return pm_regexp_parse_quantifier(parser);
                }
                return false;
            }

            // Count non-ASCII literal bytes.
            for (size_t i = 0; i < width; i++) {
                if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++;
            }

            parser->cursor += width;
            return pm_regexp_parse_quantifier(parser);
        }
    }
}

/**
 * expression : item+
 *            ;
 *
 * The depth is the current group nesting level. Once it reaches
 * PM_REGEXP_PARSE_DEPTH_MAX an error is reported and false is returned.
 */
static bool
pm_regexp_parse_expression(pm_regexp_parser_t *parser, uint16_t depth) {
    if (depth >= PM_REGEXP_PARSE_DEPTH_MAX) {
        pm_regexp_parse_error(parser, parser->start, parser->end, "parse depth limit over");
        return false;
    }

    if (!pm_regexp_parse_item(parser, depth)) {
        return false;
    }

    // Keep consuming items until something that ends the expression: the end
    // of the input, a closing parenthesis, or an alternation.
    while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
        if (!pm_regexp_parse_item(parser, depth)) {
            return false;
        }
    }

    return true;
}

/**
 * pattern : EOF
 *         | expression EOF
 *         | expression '|' pattern
 *         ;
 */
static bool
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
    do {
        if (pm_regexp_char_is_eof(parser)) return true;
        if (!pm_regexp_parse_expression(parser, 0)) return false;
    } while (pm_regexp_char_accept(parser, '|'));

    return pm_regexp_char_is_eof(parser);
}

// ---------------------------------------------------------------------------
// Encoding validation
// ---------------------------------------------------------------------------

/**
 * Validate that groups of hex escape bytes in the buffer form valid multibyte
 * characters in the given encoding. Groups are separated by 0x00 sentinels.
 */
static bool
pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) {
    const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer);
    size_t len = pm_buffer_length(buffer);
    size_t i = 0;

    while (i < len) {
        // Find the end of the current group (the next sentinel or the end of
        // the buffer).
        size_t group_start = i;
        while (i < len && data[i] != 0x00) i++;

        // Walk the group one character at a time. A zero width means the
        // remaining bytes do not form a valid character in this encoding.
        for (size_t j = group_start; j < i; ) {
            size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j));
            if (width == 0) return false;
            j += width;
        }

        if (i < len) i++; // skip sentinel
    }

    return true;
}

/**
 * Format regexp source content for use in error messages, hex-escaping
 * non-ASCII bytes.
 *
 * UTF-8 content is appended verbatim. For other encodings, ASCII bytes are
 * copied as-is, multibyte characters are written as \x{...} with every byte in
 * hex, and any other non-ASCII byte is written as a single \xNN escape.
 */
static void
pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) {
    size_t index = 0;

    if (encoding == PM_ENCODING_UTF_8_ENTRY) {
        pm_buffer_append_string(buffer, (const char *) source, length);
        return;
    }

    while (index < length) {
        if (source[index] < 0x80) {
            pm_buffer_append_byte(buffer, source[index]);
            index++;
        } else if (encoding->multibyte) {
            size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index));

            if (width > 1) {
                pm_buffer_append_string(buffer, "\\x{", 3);
                for (size_t i = 0; i < width; i++) {
                    pm_buffer_append_format(buffer, "%02X", source[index + i]);
                }
                pm_buffer_append_byte(buffer, '}');
                index += width;
            } else {
                // Either a single-byte character or an invalid sequence
                // (width 0); escape just this byte so that we always advance.
                pm_buffer_append_format(buffer, "\\x%02X", source[index]);
                index++;
            }
        } else {
            pm_buffer_append_format(buffer, "\\x%02X", source[index]);
            index++;
        }
    }
}

/**
 * Emit an encoding validation error on the regexp node. The diagnostic covers
 * the whole node: its offset is node_start relative to the start of the
 * parser's source and its length is node_end - node_start.
 */
#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \
    pm_diagnostic_list_append_format( \
        &(parser)->parser->metadata_arena, \
        &(parser)->parser->error_list, \
        (uint32_t) ((parser)->node_start - (parser)->parser->start), \
        (uint32_t) ((parser)->node_end - (parser)->node_start), \
        diag_id, __VA_ARGS__)

/**
 * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n).
 *
 * The decision tree is:
 *
 * 1. No escape-set encoding (explicit_encoding == NULL):
 *    a. ASCII-only content: validate property escapes, return forced US-ASCII
 *       for /n or the modifier flags for others.
 *    b. US-ASCII source with non-ASCII literals: emit per-byte errors.
 *    c. Source encoding differs from modifier encoding: emit mismatch error.
 *
 * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending
 *    on the modifier and which escape type was last.
 *
 * 3. \u escape with non-/u modifier: incompatible encoding error.
 *
 * 4. Validate that hex escape byte sequences form valid multibyte characters
 *    in the modifier's encoding.
 *
 * The modifier is the modifier letter ('n', 'u', 'e', or 's') and
 * modifier_encoding is the encoding that it selects. source_start and
 * source_length describe the text that is interpolated into error messages.
 * Every path returns flags unchanged, except ASCII-only content under /n,
 * which returns the forced US-ASCII flag.
 */
static pm_node_flags_t
pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) {

    if (parser->explicit_encoding == NULL) {
        if (ascii_only) {
            // Check property escapes against the modifier's encoding tier.
            //   /n (ASCII-8BIT): only POSIX properties are valid.
            //   /e, /s: POSIX and script properties are valid.
            //   /u: all properties are valid.
            if (modifier == 'n' && parser->property_name != NULL) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
                    (int) parser->property_name_length, (const char *) parser->property_name,
                    source_length, source_start);
            } else if (modifier != 'u' && parser->has_unicode_property_escape) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY,
                    (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name,
                    source_length, source_start);
            }
            return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
        }

        if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
            // One error per non-ASCII literal byte counted during parsing.
            for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
            }
        } else if (parser->encoding != modifier_encoding) {
            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name);

            // The ascii_only case already returned above, so !ascii_only
            // always holds here.
            if (modifier == 'n' && !ascii_only) {
                pm_buffer_t formatted = { 0 };
                pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length);
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value);
                pm_buffer_cleanup(&formatted);
            }
        }

        return flags;
    }

    // Mixed unicode + hex escapes.
    if (parser->has_unicode_escape && parser->has_hex_escape) {
        if (modifier == 'n') {
            // Which error is reported depends on the kind of escape that was
            // seen last.
            if (parser->last_escape_was_unicode) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
            } else {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
            }
        } else {
            if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
            }
        }

        return flags;
    }

    if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
        if (parser->last_escape_was_unicode) {
            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
        } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start);
        }
    }

    // Hex escapes are not checked against the encoding for /n.
    if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) {
        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
    }

    return flags;
}

/**
 * Validate encoding for a regexp and compute the encoding flags to set on the
 * node.
 *
 * The decision tree is:
 *
 * 1. Invalid multibyte chars: suppress further checks (errors were already
 *    emitted during parsing).
 * 2. Invalid unicode range: emit the error on the node and stop.
 * 3. If a modifier (/n, /u, /e, /s) is present, delegate to
 *    pm_regexp_validate_encoding_modifier.
 * 4. US-ASCII source with non-ASCII literals: emit per-byte errors.
 * 5. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}).
 * 6. Escape-set encoding present: validate hex escapes against the target
 *    encoding, handle mixed \u + \x conflicts, and return the appropriate
 *    forced encoding flag.
 */
static pm_node_flags_t
pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) {

    // Invalid multibyte characters suppress further validation.
    // Errors were already emitted at the byte locations during parsing.
    if (parser->has_invalid_multibyte) {
        return flags;
    }

    // An invalid unicode range is reported once on the node, and no further
    // validation is attempted.
    if (parser->invalid_unicode_range) {
        PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start);
        return flags;
    }

    // Check modifier flags first. The first matching flag wins, in the order
    // /n, /u, /e, /s.
    if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length);
    }
    if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length);
    }
    if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length);
    }
    if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
        return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length);
    }

    // No modifier — check for non-ASCII literals in US-ASCII encoding.
    if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
        for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) {
            PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
        }
    }

    // ASCII-only regexps get downgraded to US-ASCII, unless property escapes
    // force UTF-8.
    if (ascii_only) {
        if (parser->has_property_escape) {
            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
        }
        return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
    }

    // Check explicit encoding from escape sequences.
    if (parser->explicit_encoding != NULL) {
        // Mixed unicode + hex escapes without modifier. Exactly one error is
        // emitted here: an invalid hex sequence takes precedence, otherwise
        // the error depends on which kind of escape came last.
        if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) {
            if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY &&
                parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY &&
                !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
            } else if (parser->last_escape_was_unicode) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start);
            } else {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start);
            }

            return 0;
        }

        if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
            }

            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING;
        } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) {
            return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING;
        } else {
            if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) {
                PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start);
            }
        }
    }

    // No encoding flag is forced.
    return 0;
}

/**
 * Parse a regular expression, validate its encoding, and optionally extract
 * named capture groups. Encoding validation walks the raw source (content_loc)
 * to distinguish escape-produced bytes from literal bytes.
Named capture + * extraction walks the unescaped content since escape sequences in group names + * (e.g., line continuations) have already been processed by the lexer. + */ +pm_node_flags_t +pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + const uint8_t *source = parser->start + node->content_loc.start; + size_t size = node->content_loc.length; + bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED); + pm_node_flags_t flags = PM_NODE_FLAGS(node); + + const uint8_t *node_start = parser->start + node->base.location.start; + const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length; + + // First pass: walk raw source for encoding validation (no name extraction). + pm_regexp_parser_t regexp_parser = { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = NULL, + .name_data = NULL, + .shared = true, + .node_start = node_start, + .node_end = node_end, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + + // Compute ascii_only from the regexp parser's tracked state. We cannot + // use node->unescaped for this because regexp unescaped content preserves + // escape text (e.g., \x80 is 4 ASCII chars), not the binary values. 
+ bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0; + // Use the unescaped content for error messages to match CRuby's format, + // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes + // like \u{80} are preserved as text. + const char *error_source = (const char *) pm_string_source(&node->unescaped); + int error_source_length = (int) pm_string_length(&node->unescaped); + pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(®exp_parser, ascii_only, flags, error_source, error_source_length); + pm_buffer_cleanup(®exp_parser.hex_escape_buffer); + + // Second pass: walk unescaped content for named capture extraction. + if (name_callback != NULL) { + bool shared = node->unescaped.type == PM_STRING_SHARED; + pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data); + } + + return encoding_flags; +} + +/** + * Parse an interpolated regular expression for named capture groups only. + * This is used for the =~ operator with interpolated regexps where we don't + * have a pm_regular_expression_node_t. No encoding validation is performed. + * + * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.) + * are initialized but not used for the result. They exist because the parsing + * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update + * them as they walk through the content. 
+ */ +void +pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + pm_regexp_parser_t regexp_parser = { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = name_callback, + .name_data = name_data, + .shared = shared, + .node_start = source, + .node_end = source + size, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + pm_buffer_cleanup(®exp_parser.hex_escape_buffer); +} |
