[ruby/prism] Always return the character width for char_is_identifier_start() and char_is_identifier_utf8()

* This is also faster than calling pm_encoding_utf_8_alpha_char/pm_encoding_utf_8_alnum_char, as those compute the character width and do extra checks.

https://github.com/ruby/prism/commit/4cb276ac4c
author: Benoit Daloze <eregontp@gmail.com> 2024-01-31 22:19:36 +0100
committer: git <svn-admin@ruby-lang.org> 2024-01-31 21:29:16 +0000
commit: 9fdfdf4fca22e892e92ad7060abac48a00516d81 (patch)
tree: ea0f5f85fc45eb9e9c6192cf8003686187503f3b /prism
parent: b5a2c60d0a96664f99a1f39299779305ebb9dcbf (diff)
3 files changed, 16 insertions, 3 deletions
diff --git a/prism/encoding.c b/prism/encoding.c
index 2210d71411..bdb9958e97 100644
--- a/prism/encoding.c
+++ b/prism/encoding.c
@@ -2277,7 +2277,10 @@ pm_utf_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) {
     return 0;
 }
 
-static size_t
+/**
+ * Return the size of the next character in the UTF-8 encoding.
+ */
+size_t
 pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) {
     size_t width;
     pm_utf_8_codepoint(b, n, &width);
diff --git a/prism/encoding.h b/prism/encoding.h
index 8fe01aea69..7ba1695de8 100644
--- a/prism/encoding.h
+++ b/prism/encoding.h
@@ -80,6 +80,16 @@ typedef struct {
 #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
 
 /**
+ * Return the size of the next character in the UTF-8 encoding.
+ *
+ * @param b The bytes to read.
+ * @param n The number of bytes that can be read.
+ * @returns The number of bytes that the next character takes if it is valid in
+ *     the encoding, or 0 if it is not.
+ */
+size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
+
+/**
  * Return the size of the next character in the UTF-8 encoding if it is an
  * alphabetical character.
  *
diff --git a/prism/prism.c b/prism/prism.c
index f387d1305f..a1049ceab5 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -6249,7 +6249,7 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
     } else if (*b < 0x80) {
         return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT ? 1 : 0) || (*b == '_');
     } else {
-        return (size_t) (pm_encoding_utf_8_alpha_char(b, parser->end - b) || 1u);
+        return pm_encoding_utf_8_char_width(b, parser->end - b);
     }
 }
 
@@ -6262,7 +6262,7 @@ char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
     if (*b < 0x80) {
         return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
     } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
+        return pm_encoding_utf_8_char_width(b, end - b);
     }
 }
author	Benoit Daloze <eregontp@gmail.com>	2024-01-31 22:19:36 +0100
committer	git <svn-admin@ruby-lang.org>	2024-01-31 21:29:16 +0000
commit	9fdfdf4fca22e892e92ad7060abac48a00516d81 (patch)
tree	ea0f5f85fc45eb9e9c6192cf8003686187503f3b /prism
parent	b5a2c60d0a96664f99a1f39299779305ebb9dcbf (diff)