diff options
Diffstat (limited to 'prism/regexp.c')
| -rw-r--r-- | prism/regexp.c | 1015 |
1 files changed, 971 insertions, 44 deletions
diff --git a/prism/regexp.c b/prism/regexp.c index dcc7476244..cc17aa4d09 100644 --- a/prism/regexp.c +++ b/prism/regexp.c @@ -1,5 +1,20 @@ -#include "prism/regexp.h" - +#include "prism/internal/regexp.h" + +#include "prism/compiler/inline.h" +#include "prism/compiler/fallthrough.h" +#include "prism/internal/buffer.h" +#include "prism/internal/char.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/memchr.h" +#include "prism/internal/parser.h" +#include "prism/internal/stringy.h" +#include "prism/internal/strncasecmp.h" + +#include <assert.h> +#include <string.h> + +/** The maximum depth of nested groups allowed in a regular expression. */ #define PM_REGEXP_PARSE_DEPTH_MAX 4096 /** @@ -18,6 +33,54 @@ typedef struct { /** A pointer to the end of the source that we are parsing. */ const uint8_t *end; + /** The encoding of the source. */ + const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + pm_regexp_name_data_t *name_data; + + /** The start of the regexp node (for error locations). */ + const uint8_t *node_start; + + /** The end of the regexp node (for error locations). */ + const uint8_t *node_end; + + /** + * The explicit encoding determined by escape sequences. NULL if no + * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the + * source encoding for `\x` escapes. + */ + const pm_encoding_t *explicit_encoding; + + /** + * Pointer to the first non-POSIX property name (for /n error messages). + * POSIX properties (Alnum, Alpha, etc.) work in all encodings. + * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u. + * Unicode-only properties (L, Ll, etc.) work only in /u. + */ + const uint8_t *property_name; + + /** Length of the first non-POSIX property name found. 
*/ + size_t property_name_length; + + /** + * Pointer to the first Unicode-only property name (for /e, /s error + * messages). NULL if only POSIX or script properties have been seen. + */ + const uint8_t *unicode_property_name; + + /** Length of the first Unicode-only property name found. */ + size_t unicode_property_name_length; + + /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */ + pm_buffer_t hex_escape_buffer; + + /** Count of non-ASCII literal bytes (not from escapes). */ + uint32_t non_ascii_literal_count; + /** * Whether or not the regular expression currently being parsed is in * extended mode, wherein whitespace is ignored and comments are allowed. @@ -27,31 +90,77 @@ typedef struct { /** Whether the encoding has changed from the default. */ bool encoding_changed; - /** The encoding of the source. */ - const pm_encoding_t *encoding; + /** Whether the source content is shared (for named capture callback). */ + bool shared; - /** The callback to call when a named capture group is found. */ - pm_regexp_name_callback_t name_callback; + /** Whether a `\uNNNN` or `\u{...}` escape with value >= 0x80 was seen. */ + bool has_unicode_escape; - /** The data to pass to the name callback. */ - void *name_data; + /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */ + bool has_hex_escape; + + /** + * Tracks whether the last encoding-setting escape was `\u` (true) or `\x` + * (false). This matters for error messages when both types are mixed. + */ + bool last_escape_was_unicode; + + /** Whether any `\p{...}` or `\P{...}` property escape was found. */ + bool has_property_escape; + + /** Whether a Unicode-only property escape was found (not POSIX or script). */ + bool has_unicode_property_escape; - /** The callback to call when a parse error is found. */ - pm_regexp_error_callback_t error_callback; + /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. 
*/ + bool invalid_unicode_range; - /** The data to pass to the error callback. */ - void *error_data; + /** Whether we are accumulating consecutive hex escape bytes. */ + bool hex_group_active; + + /** Whether an invalid multibyte character was found during parsing. */ + bool has_invalid_multibyte; } pm_regexp_parser_t; /** - * Append an error to the parser. + * Append a syntax error to the parser's error list. If the source is shared + * (points into the original source), we can point to the exact error location. + * Otherwise, we point to the whole regexp node. */ -static inline void +static PRISM_INLINE void pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { - parser->error_callback(start, end, message, parser->error_data); + pm_parser_t *pm = parser->parser; + uint32_t loc_start, loc_length; + + if (parser->shared) { + loc_start = (uint32_t) (start - pm->start); + loc_length = (uint32_t) (end - start); + } else { + loc_start = (uint32_t) (parser->node_start - pm->start); + loc_length = (uint32_t) (parser->node_end - parser->node_start); + } + + pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message); } /** + * Append a formatted diagnostic error with proper shared/non-shared location + * handling. This is a macro because we need variadic args for the format string. + */ +#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) 
\ + do { \ + pm_parser_t *pm__ = (parser_)->parser; \ + uint32_t loc_start__, loc_length__; \ + if ((parser_)->shared) { \ + loc_start__ = (uint32_t) ((err_start_) - pm__->start); \ + loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \ + } else { \ + loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \ + loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \ + } \ + pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \ + } while (0) + +/** * This appends a new string to the list of named captures. This function * assumes the caller has already checked the validity of the name callback. */ @@ -59,14 +168,14 @@ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { pm_string_t string; pm_string_shared_init(&string, start, end); - parser->name_callback(&string, parser->name_data); - pm_string_free(&string); + parser->name_callback(parser->parser, &string, parser->shared, parser->name_data); + pm_string_cleanup(&string); } /** * Returns true if the next character is the end of the source. */ -static inline bool +static PRISM_INLINE bool pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { return parser->cursor >= parser->end; } @@ -74,7 +183,7 @@ pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { /** * Optionally accept a char and consume it if it exists. */ -static inline bool +static PRISM_INLINE bool pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { parser->cursor++; @@ -86,7 +195,7 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { /** * Expect a character to be present and consume it. 
*/ -static inline bool +static PRISM_INLINE bool pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { parser->cursor++; @@ -114,6 +223,47 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { } /** + * Mark a group boundary in the hex escape byte buffer. When consecutive hex + * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00 + * sentinel to separate the groups for later multibyte validation. + */ +static PRISM_INLINE void +pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) { + if (parser->hex_group_active) { + pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00); + parser->hex_group_active = false; + } +} + +/** + * Track a hex escape byte value >= 0x80 for multibyte validation. + */ +static PRISM_INLINE void +pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) { + if (byte >= 0x80) { + pm_buffer_append_byte(&parser->hex_escape_buffer, byte); + parser->hex_group_active = true; + parser->has_hex_escape = true; + + parser->explicit_encoding = parser->encoding; + parser->last_escape_was_unicode = false; + } else { + pm_regexp_hex_group_boundary(parser); + } +} + +/** + * Parse a hex digit character and return its value, or -1 if not a hex digit. + */ +static PRISM_INLINE int +pm_regexp_hex_digit_value(uint8_t byte) { + if (byte >= '0' && byte <= '9') return byte - '0'; + if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10; + if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10; + return -1; +} + +/** * Range quantifiers are a special class of quantifiers that look like * * * {digit} @@ -121,13 +271,12 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { * * {digit,digit} * * {,digit} * - * Unfortunately, if there are any spaces in between, then this just becomes a - * regular character match expression and we have to backtrack. 
So when this - * function first starts running, we'll create a "save" point and then attempt - * to parse the quantifier. If it fails, we'll restore the save point and - * return. + * If there are any spaces in between, then this just becomes a regular + * character match expression and we have to backtrack. So when this function + * first starts running, we'll create a "save" point and then attempt to parse + * the quantifier. If it fails, we'll restore the save point and return. * - * The properly track everything, we're going to build a little state machine. + * To properly track everything, we're going to build a little state machine. * It looks something like the following: * * +-------+ +---------+ ------------+ @@ -275,11 +424,393 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { ); } +/** + * Property escape classification. Onigmo supports three tiers of property + * names depending on the encoding: + * + * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower, + * Print, Punct, Space, Upper, XDigit, Word): valid in all encodings. + * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid + * in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n). + * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus + * Any, Assigned): valid only in UTF-8 (/u). + */ +typedef enum { + PM_REGEXP_PROPERTY_POSIX, + PM_REGEXP_PROPERTY_SCRIPT, + PM_REGEXP_PROPERTY_UNICODE +} pm_regexp_property_type_t; + +/** + * Classify a property name. The name may start with '^' for negation, which + * is skipped before matching. + */ +static pm_regexp_property_type_t +pm_regexp_classify_property(const uint8_t *name, size_t length) { + // Skip leading '^' for negated properties like \p{^Hiragana}. 
+ if (length > 0 && name[0] == '^') { + name++; + length--; + } + +#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0) + + switch (length) { + case 3: + if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 4: + if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 5: + /* Most properties are length 5, so dispatch on first character. */ + switch (name[0] | 0x20) { + case 'a': + if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'b': + if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'c': + if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'd': + if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'g': + if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'l': + if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'p': + if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 's': + if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'u': + if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX; + break; + } + break; + case 6: + if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 8: + if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + } + +#undef PM_REGEXP_CASECMP + + // Everything else is Unicode-only 
(general categories, other scripts, etc.). + return PM_REGEXP_PROPERTY_UNICODE; +} + +/** + * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The + * cursor should be pointing at 'p' or 'P' when this is called. If a property + * escape is found, record it on the regexp parser and advance past the closing + * '}'. + * + * Properties are classified into three tiers (POSIX, script, Unicode-only) to + * determine which encoding modifiers they are valid with. + */ +static bool +pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) { + assert(*parser->cursor == 'p' || *parser->cursor == 'P'); + + if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') { + const uint8_t *name_start = parser->cursor + 2; + const uint8_t *search = name_start; + + while (search < parser->end && *search != '}') search++; + + if (search < parser->end) { + size_t name_length = (size_t) (search - name_start); + parser->has_property_escape = true; + + pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length); + + // Track the first non-POSIX property name (for /n error messages). + if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) { + parser->property_name = name_start; + parser->property_name_length = name_length; + } + + // Track the first Unicode-only property name (for /e, /s error messages). + if (type == PM_REGEXP_PROPERTY_UNICODE) { + parser->has_unicode_property_escape = true; + if (parser->unicode_property_name == NULL) { + parser->unicode_property_name = name_start; + parser->unicode_property_name_length = name_length; + } + } + + parser->cursor = search + 1; // skip past '}' + return true; + } + } + + // Not a property escape, just skip the single character after '\'. + parser->cursor++; + return false; +} + +/** + * Validate and skip a \u escape sequence in a regular expression. The cursor + * should be pointing at the character after 'u' when this is called. 
This + * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding + * state for validation. + */ +static void +pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) { + const uint8_t *escape_start = parser->cursor - 2; // points to '\' + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + return; + } + + if (*parser->cursor == '{') { + parser->cursor++; // skip '{' + + // Skip leading whitespace. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + + bool has_codepoint = false; + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + // Parse the hex digits to compute the codepoint value. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count == 0) { + // Skip to '}' or end of regexp to find the full extent. + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + parser->cursor++; + } + + const uint8_t *escape_end = parser->cursor; + if (!pm_regexp_char_is_eof(parser)) { + escape_end++; + parser->cursor++; // skip '}' + } + + pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start); + return; + } + + if (hex_count > 6) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range"); + } + + // Track encoding state for this codepoint. + if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range (surrogates or > 0x10FFFF). 
+ if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) { + parser->invalid_unicode_range = true; + } + + has_codepoint = true; + + // Skip whitespace between codepoints. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + } + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape"); + } else { + if (!has_codepoint) { + pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start); + } + parser->cursor++; // skip '}' + } + } else { + // \uNNNN form — need exactly 4 hex digits. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count < 4) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + } else if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range. + if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) { + parser->invalid_unicode_range = true; + } + } +} + // Forward declaration because character sets can be nested. static bool pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); /** + * Parse a \x escape and return the byte value. The cursor should be pointing + * at the character after 'x'. Returns -1 if no hex digits follow. 
+ */ +static int +pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) { + int value = -1; + + if (!pm_regexp_char_is_eof(parser)) { + int digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = digit; + parser->cursor++; + + if (!pm_regexp_char_is_eof(parser)) { + digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = (value << 4) | digit; + parser->cursor++; + } + } + } + } + + if (value >= 0) { + pm_regexp_track_hex_escape(parser, (uint8_t) value); + } + + return value; +} + +/** + * Parse a backslash escape sequence in a regexp, handling \u (unicode), + * \p/\P (property), \x (hex), and other single-character escapes. Also + * tracks encoding state for \M-x and \C-\M-x escapes. + */ +static void +pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) { + if (pm_regexp_char_is_eof(parser)) return; + + switch (*parser->cursor) { + case 'u': + parser->cursor++; // skip 'u' + pm_regexp_parse_unicode_escape(parser); + break; + case 'p': + case 'P': + pm_regexp_parse_property_escape(parser); + break; + case 'x': + parser->cursor++; // skip 'x' + pm_regexp_parse_hex_escape(parser); + break; + case 'M': + // \M-x produces (x | 0x80), always >= 0x80 + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'M-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80 + // We just need to track it as a hex escape >= 0x80. 
+ pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + // \M-x always produces a byte >= 0x80 + pm_regexp_track_hex_escape(parser, 0x80); + } + } else { + parser->cursor++; + } + break; + case 'C': + // \C-x produces (x & 0x1F) + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'C-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + } else { + parser->cursor++; + } + break; + case 'c': + // \cx produces (x & 0x1F) + parser->cursor++; // skip 'c' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + break; + default: + pm_regexp_hex_group_boundary(parser); + parser->cursor++; + break; + } +} + +/** + * Check if a byte at the current position is a non-ASCII byte in a multibyte + * encoding that produces an invalid character. If so, emit an error at the + * byte location immediately. + */ +static void +pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) { + uint8_t byte = *cursor; + if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) { + size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor)); + if (width > 1) { + parser->cursor += width - 1; + } else if (width == 0) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } +} + +/** * match-char-set : '[' '^'? 
(match-range | match-char)* ']' * ; */ @@ -293,12 +824,16 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1)); break; case '\\': - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); break; default: - // do nothing, we've already advanced the cursor + // We've already advanced the cursor by one byte. If the byte + // was >= 0x80 in a multibyte encoding, we may need to consume + // additional continuation bytes and validate the character. + if (*(parser->cursor - 1) >= 0x80) { + parser->non_ascii_literal_count++; + } + pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1); break; } } @@ -354,8 +889,13 @@ typedef enum { // These are the options that are configurable on the regular expression (or // from within a group). +/** The minimum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' + +/** The maximum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' + +/** The number of regexp option slots. 
*/ #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) /** @@ -498,7 +1038,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { } size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); - if (width == 0) return false; + if (width == 0) { + if (*parser->cursor >= 0x80) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + parser->cursor++; + continue; + } + return false; + } escaped = (width == 1) && (*parser->cursor == '\\'); parser->cursor += width; @@ -686,9 +1234,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { return pm_regexp_parse_quantifier(parser); case '\\': parser->cursor++; - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); return pm_regexp_parse_quantifier(parser); case '(': parser->cursor++; @@ -720,9 +1266,30 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); } - if (width == 0) return false; // TODO: add appropriate error - parser->cursor += width; + if (width == 0) { + if (*parser->cursor >= 0x80 && parser->encoding_changed) { + if (parser->encoding->multibyte) { + // Invalid multibyte character in a multibyte encoding. + // Emit the error at the byte location immediately. + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } else { + // Non-ASCII byte in a single-byte encoding (e.g., + // US-ASCII). Count it for later error reporting. 
+ parser->non_ascii_literal_count++; + } + parser->cursor++; + return pm_regexp_parse_quantifier(parser); + } + return false; + } + + // Count non-ASCII literal bytes. + for (size_t i = 0; i < width; i++) { + if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++; + } + parser->cursor += width; return pm_regexp_parse_quantifier(parser); } } @@ -768,13 +1335,354 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { return pm_regexp_char_is_eof(parser); } +// --------------------------------------------------------------------------- +// Encoding validation +// --------------------------------------------------------------------------- + /** - * Parse a regular expression and extract the names of all of the named capture - * groups. + * Validate that groups of hex escape bytes in the buffer form valid multibyte + * characters in the given encoding. Groups are separated by 0x00 sentinels. + */ +static bool +pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) { + const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer); + size_t len = pm_buffer_length(buffer); + size_t i = 0; + + while (i < len) { + size_t group_start = i; + while (i < len && data[i] != 0x00) i++; + + for (size_t j = group_start; j < i; ) { + size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j)); + if (width == 0) return false; + j += width; + } + + if (i < len) i++; // skip sentinel + } + + return true; +} + +/** + * Format regexp source content for use in error messages, hex-escaping + * non-ASCII bytes. 
+ */ +static void +pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) { + size_t index = 0; + + if (encoding == PM_ENCODING_UTF_8_ENTRY) { + pm_buffer_append_string(buffer, (const char *) source, length); + return; + } + + while (index < length) { + if (source[index] < 0x80) { + pm_buffer_append_byte(buffer, source[index]); + index++; + } else if (encoding->multibyte) { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index)); + + if (width > 1) { + pm_buffer_append_string(buffer, "\\x{", 3); + for (size_t i = 0; i < width; i++) { + pm_buffer_append_format(buffer, "%02X", source[index + i]); + } + pm_buffer_append_byte(buffer, '}'); + index += width; + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } +} + +/** + * Emit an encoding validation error on the regexp node. + */ +#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \ + pm_diagnostic_list_append_format( \ + &(parser)->parser->metadata_arena, \ + &(parser)->parser->error_list, \ + (uint32_t) ((parser)->node_start - (parser)->parser->start), \ + (uint32_t) ((parser)->node_end - (parser)->node_start), \ + diag_id, __VA_ARGS__) + +/** + * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n). + * + * The decision tree is: + * + * 1. No escape-set encoding (explicit_encoding == NULL): + * a. ASCII-only content: validate property escapes, return forced US-ASCII + * for /n or the modifier flags for others. + * b. US-ASCII source with non-ASCII literals: emit per-byte errors. + * c. Source encoding differs from modifier encoding: emit mismatch error. + * + * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending + * on the modifier and which escape type was last. + * + * 3. \u escape with non-/u modifier: incompatible encoding error. + * + * 4. 
Validate that hex escape byte sequences form valid multibyte characters + * in the modifier's encoding. + */ +static pm_node_flags_t +pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) { + + if (parser->explicit_encoding == NULL) { + if (ascii_only) { + // Check property escapes against the modifier's encoding tier. + // /n (ASCII-8BIT): only POSIX properties are valid. + // /e, /s: POSIX and script properties are valid. + // /u: all properties are valid. + if (modifier == 'n' && parser->property_name != NULL) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->property_name_length, (const char *) parser->property_name, + source_length, source_start); + } else if (modifier != 'u' && parser->has_unicode_property_escape) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name, + source_length, source_start); + } + return modifier == 'n' ? 
PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; + } + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } else if (parser->encoding != modifier_encoding) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); + + if (modifier == 'n' && !ascii_only) { + pm_buffer_t formatted = { 0 }; + pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length); + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value); + pm_buffer_cleanup(&formatted); + } + } + + return flags; + } + + // Mixed unicode + hex escapes. + if (parser->has_unicode_escape && parser->has_hex_escape) { + if (modifier == 'n') { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + } else { + if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + + return flags; + } + + if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } + } + + if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, 
PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return flags; +} + +/** + * Validate the encoding of a regexp and compute the encoding flags to set on + * the node. + * + * The decision tree is: + * + * 1. Invalid multibyte chars: return early (errors were already emitted during + * parsing). Invalid unicode ranges: emit the range error and return early. + * 2. If a modifier (/n, /u, /e, /s) is present, delegate to + * pm_regexp_validate_encoding_modifier. + * 3. US-ASCII source with non-ASCII literals: emit per-byte errors. + * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}). + * 5. Escape-set encoding present: validate hex escapes against the target + * encoding, handle mixed \u + \x conflicts, and return the appropriate + * forced encoding flag. + */ +static pm_node_flags_t +pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) { + + // Invalid multibyte characters suppress further validation. + // Errors were already emitted at the byte locations during parsing. + if (parser->has_invalid_multibyte) { + return flags; + } + + if (parser->invalid_unicode_range) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start); + return flags; + } + + // Check modifier flags next. 
+ if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length); + } + + // No modifier — check for non-ASCII literals in US-ASCII encoding. + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } + + // ASCII-only regexps get downgraded to US-ASCII, unless property escapes + // force UTF-8. + if (ascii_only) { + if (parser->has_property_escape) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; + } + + // Check explicit encoding from escape sequences. + if (parser->explicit_encoding != NULL) { + // Mixed unicode + hex escapes without modifier. 
+ if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY && + parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY && + !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } else if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + + return 0; + } + + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; + } else { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + } + + return 0; +} + +/** + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Encoding validation walks the raw source (content_loc) + * to distinguish escape-produced bytes from literal bytes. Named capture + * extraction walks the unescaped content since escape sequences in group names + * (e.g., line continuations) have already been processed by the lexer. 
+ */ +pm_node_flags_t +pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + const uint8_t *source = parser->start + node->content_loc.start; + size_t size = node->content_loc.length; + bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED); + pm_node_flags_t flags = PM_NODE_FLAGS(node); + + const uint8_t *node_start = parser->start + node->base.location.start; + const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length; + + // First pass: walk raw source for encoding validation (no name extraction). + pm_regexp_parser_t regexp_parser = { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = NULL, + .name_data = NULL, + .shared = true, + .node_start = node_start, + .node_end = node_end, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + + // Compute ascii_only from the regexp parser's tracked state. We cannot + // use node->unescaped for this because regexp unescaped content preserves + // escape text (e.g., \x80 is 4 ASCII chars), not the binary values. 
+ bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0; + // Use the unescaped content for error messages to match CRuby's format, + // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes + // like \u{80} are preserved as text. + const char *error_source = (const char *) pm_string_source(&node->unescaped); + int error_source_length = (int) pm_string_length(&node->unescaped); + pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(®exp_parser, ascii_only, flags, error_source, error_source_length); + pm_buffer_cleanup(®exp_parser.hex_escape_buffer); + + // Second pass: walk unescaped content for named capture extraction. + if (name_callback != NULL) { + bool shared = node->unescaped.type == PM_STRING_SHARED; + pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data); + } + + return encoding_flags; +} + +/** + * Parse an interpolated regular expression for named capture groups only. + * This is used for the =~ operator with interpolated regexps where we don't + * have a pm_regular_expression_node_t. No encoding validation is performed. + * + * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.) + * are initialized but not used for the result. They exist because the parsing + * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update + * them as they walk through the content. 
*/ -PRISM_EXPORTED_FUNCTION void -pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { - pm_regexp_parse_pattern(&(pm_regexp_parser_t) { +void +pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + pm_regexp_parser_t regexp_parser = { .parser = parser, .start = source, .cursor = source, @@ -784,7 +1692,26 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool ex .encoding = parser->encoding, .name_callback = name_callback, .name_data = name_data, - .error_callback = error_callback, - .error_data = error_data - }); + .shared = shared, + .node_start = source, + .node_end = source + size, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(&regexp_parser); + pm_buffer_cleanup(&regexp_parser.hex_escape_buffer); } |
