diff options
| author | Kevin Newton <kddnewton@gmail.com> | 2025-01-07 22:49:46 -0500 |
|---|---|---|
| committer | git <svn-admin@ruby-lang.org> | 2025-01-08 20:36:06 +0000 |
| commit | c4534c9fe88259de9509e4b75fb6afc31a1f00a7 (patch) | |
| tree | 13c7e7fb7647152821446e6f0005ae3129f758ad | |
| parent | 36b6625ba9ccaa0ab88dd56b3f41c70e161f3df0 (diff) | |
[ruby/prism] Handle escapes in named capture names
https://github.com/ruby/prism/commit/b4b7a69ce7
| -rw-r--r-- | prism/prism.c | 161 | ||||
| -rw-r--r-- | prism/util/pm_buffer.c | 40 | ||||
| -rw-r--r-- | prism/util/pm_buffer.h | 10 | ||||
| -rw-r--r-- | test/prism/result/named_capture_test.rb | 29 |
4 files changed, 223 insertions, 17 deletions
diff --git a/prism/prism.c b/prism/prism.c index 19ff1d2fe6..6e3b41100b 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -9551,21 +9551,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; } - if (value <= 0x7F) { // 0xxxxxxx - pm_buffer_append_byte(buffer, (uint8_t) value); - } else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx - pm_buffer_append_byte(buffer, (uint8_t) (0xC0 | (value >> 6))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F))); - } else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx - pm_buffer_append_byte(buffer, (uint8_t) (0xE0 | (value >> 12))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F))); - } else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - pm_buffer_append_byte(buffer, (uint8_t) (0xF0 | (value >> 18))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F))); - pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F))); - } else { + if (!pm_buffer_append_unicode_codepoint(buffer, value)) { pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE); pm_buffer_append_byte(buffer, 0xEF); pm_buffer_append_byte(buffer, 0xBF); @@ -20873,6 +20859,123 @@ typedef struct { bool shared; } parse_regular_expression_named_capture_data_t; +static inline const uint8_t * +pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { + cursor++; + + if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) { + uint8_t value = escape_hexadecimal_digit(*cursor); + cursor++; + + if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) { + value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(*cursor)); + cursor++; + } + + pm_buffer_append_byte(unescaped, value); + } else { + pm_buffer_append_string(unescaped, "\\x", 2); + } + + return cursor; +} + +static inline const uint8_t * +pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { + uint8_t value = (uint8_t) (*cursor - '0'); + cursor++; + + if (cursor < end && pm_char_is_octal_digit(*cursor)) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0')); + cursor++; + + if (cursor < end && pm_char_is_octal_digit(*cursor)) { + value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0')); + cursor++; + } + } + + pm_buffer_append_byte(unescaped, value); + return cursor; +} + +static inline const uint8_t * +pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { + const uint8_t *start = cursor - 1; + cursor++; + + if (cursor >= end) { + pm_buffer_append_string(unescaped, "\\u", 2); + return cursor; + } + + if (*cursor != '{') { + size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4)); + uint32_t value = escape_unicode(parser, cursor, length); + + if (!pm_buffer_append_unicode_codepoint(unescaped, value)) { + pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start)); + } + + return cursor + length; + } + + cursor++; + for (;;) { + while (cursor < end && *cursor == ' ') cursor++; + + if (cursor >= end) break; + if (*cursor == '}') { + cursor++; + break; + } + + size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor); + uint32_t value = escape_unicode(parser, cursor, length); + + (void) pm_buffer_append_unicode_codepoint(unescaped, value); + cursor += length; + } + + return cursor; +} + +static void +pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) { + const uint8_t *end = source + length; + pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source)); + + for (;;) { + if (++cursor >= end) { + pm_buffer_append_byte(unescaped, '\\'); + return; + } + + switch (*cursor) { + case 'x': + cursor = pm_named_capture_escape_hex(unescaped, cursor, end); + break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': + cursor = pm_named_capture_escape_octal(unescaped, cursor, end); + break; + case 'u': + cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end); + break; + default: + pm_buffer_append_byte(unescaped, '\\'); + break; + } + + const uint8_t *next_cursor = pm_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, parser->encoding); + if (next_cursor == NULL) break; + + pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (next_cursor - cursor)); + cursor = next_cursor; + } + + pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (end - cursor)); +} + /** * This callback is called when the regular expression parser encounters a named * capture group. @@ -20887,13 +20990,32 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { const uint8_t *source = pm_string_source(capture); size_t length = pm_string_length(capture); + pm_buffer_t unescaped = { 0 }; + + // First, we need to handle escapes within the name of the capture group. + // This is because regular expressions have three different representations + // in prism. The first is the plain source code. The second is the + // representation that will be sent to the regular expression engine, which + // is the value of the "unescaped" field. This is poorly named, because it + // actually still contains escapes, just a subset of them that the regular + // expression engine knows how to handle. The third representation is fully + // unescaped, which is what we need. + const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding); + if (PRISM_UNLIKELY(cursor != NULL)) { + pm_named_capture_escape(parser, &unescaped, source, length, cursor); + source = (const uint8_t *) pm_buffer_value(&unescaped); + length = pm_buffer_length(&unescaped); + } pm_location_t location; pm_constant_id_t name; // If the name of the capture group isn't a valid identifier, we do // not add it to the local table. - if (!pm_slice_is_valid_local(parser, source, source + length)) return; + if (!pm_slice_is_valid_local(parser, source, source + length)) { + pm_buffer_free(&unescaped); + return; + } if (callback_data->shared) { // If the unescaped string is a slice of the source, then we can @@ -20921,7 +21043,10 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) { // If the local is not already a local but it is a keyword, then we // do not want to add a capture for this. - if (pm_local_is_keyword((const char *) source, length)) return; + if (pm_local_is_keyword((const char *) source, length)) { + pm_buffer_free(&unescaped); + return; + } // If the identifier is not already a local, then we will add it to // the local table. @@ -20939,6 +21064,8 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth); pm_node_list_append(&callback_data->match->targets, target); } + + pm_buffer_free(&unescaped); } /** diff --git a/prism/util/pm_buffer.c b/prism/util/pm_buffer.c index 1e79e46718..2136a7c43e 100644 --- a/prism/util/pm_buffer.c +++ b/prism/util/pm_buffer.c @@ -173,6 +173,46 @@ pm_buffer_append_double(pm_buffer_t *buffer, double value) { } /** + * Append a unicode codepoint to the buffer. + */ +bool +pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value) { + if (value <= 0x7F) { + pm_buffer_append_byte(buffer, (uint8_t) value); // 0xxxxxxx + return true; + } else if (value <= 0x7FF) { + uint8_t bytes[] = { + (uint8_t) (0xC0 | ((value >> 6) & 0x3F)), // 110xxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 2); + return true; + } else if (value <= 0xFFFF) { + uint8_t bytes[] = { + (uint8_t) (0xE0 | ((value >> 12) & 0x3F)), // 1110xxxx + (uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 3); + return true; + } else if (value <= 0x10FFFF) { + uint8_t bytes[] = { + (uint8_t) (0xF0 | ((value >> 18) & 0x3F)), // 11110xxx + (uint8_t) (0x80 | ((value >> 12) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx + (uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx + }; + + pm_buffer_append_bytes(buffer, bytes, 4); + return true; + } else { + return false; + } +} + +/** * Append a slice of source code to the buffer. */ void diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h index 58f7b506cf..f3c20ab2a5 100644 --- a/prism/util/pm_buffer.h +++ b/prism/util/pm_buffer.h @@ -138,6 +138,16 @@ void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); void pm_buffer_append_double(pm_buffer_t *buffer, double value); /** + * Append a unicode codepoint to the buffer. + * + * @param buffer The buffer to append to. + * @param value The character to append. + * @returns True if the codepoint was valid and appended successfully, false + * otherwise. + */ +bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value); + +/** * The different types of escaping that can be performed by the buffer when * appending a slice of Ruby source code. */ diff --git a/test/prism/result/named_capture_test.rb b/test/prism/result/named_capture_test.rb new file mode 100644 index 0000000000..36cb910899 --- /dev/null +++ b/test/prism/result/named_capture_test.rb @@ -0,0 +1,29 @@ +# frozen_string_literal: true + +require_relative "../test_helper" + +module Prism + class NamedCaptureTest < TestCase + def test_hex_escapes + assert_equal :😀, parse_name("\\xf0\\x9f\\x98\\x80") + end + + def test_unicode_escape + assert_equal :し, parse_name("\\u3057") + end + + def test_unicode_escapes_bracess + assert_equal :😀, parse_name("\\u{1f600}") + end + + def test_octal_escapes + assert_equal :😀, parse_name("\\xf0\\x9f\\x98\\200") + end + + private + + def parse_name(content) + Prism.parse_statement("/(?<#{content}>)/ =~ ''").targets.first.name + end + end +end |
