From 9fdfdf4fca22e892e92ad7060abac48a00516d81 Mon Sep 17 00:00:00 2001
From: Benoit Daloze
Date: Wed, 31 Jan 2024 22:19:36 +0100
Subject: [ruby/prism] Always return the character width for char_is_identifier_start() and char_is_identifier_utf8()

* This is also faster than calling pm_encoding_utf_8_alpha_char/pm_encoding_utf_8_alnum_char as those compute the character width and do extra checks.

https://github.com/ruby/prism/commit/4cb276ac4c
---
 prism/encoding.c |  5 ++++-
 prism/encoding.h | 10 ++++++++++
 prism/prism.c    |  4 ++--
 3 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'prism')

diff --git a/prism/encoding.c b/prism/encoding.c
index 2210d71411..bdb9958e97 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -2277,7 +2277,10 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
     return 0;
 }
 
-static size_t
+/**
+ * Return the size of the next character in the UTF-8 encoding.
+ */
+size_t
 pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
     size_t width;
     pm_utf_8_codepoint(b, n, &width);
diff --git a/prism/encoding.h b/prism/encoding.h
index 8fe01aea69..7ba1695de8 100644
--- a/prism/encoding.h
+++ b/prism/encoding.h
@@ -79,6 +79,16 @@ typedef struct {
  */
 #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
 
+/**
+ * Return the size of the next character in the UTF-8 encoding.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ * the encoding, or 0 if it is not.
+ */
+size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
+
 /**
  * Return the size of the next character in the UTF-8 encoding if it is an
  * alphabetical character.
diff --git a/prism/prism.c b/prism/prism.c
index f387d1305f..a1049ceab5 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6249,7 +6249,7 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
     } else if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
     } else {
-        return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
+        return pm_encoding_utf_8_char_width(b, parser->end - b);
     }
 }
 
@@ -6262,7 +6262,7 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
     if (*b < 0x80) {
         return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
     } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
+        return pm_encoding_utf_8_char_width(b, end - b);
     }
 }
 
-- 
cgit v1.2.3