[ruby/prism] Account for encoding in regexp named captures

https://github.com/ruby/prism/commit/17dc6b6281
author: Kevin Newton <kddnewton@gmail.com> 2024-02-18 16:36:16 -0500
committer: git <svn-admin@ruby-lang.org> 2024-02-18 21:42:09 +0000
commit: ec1eda7b6270fc433682c2e705381bb7959c7195 (patch)
tree: 14791ddd5046900887d7ed6d3a3076dca49e154f
parent: ea529dd409a7edcaa2c6154a315ec6766b98459d (diff)
5 files changed, 72 insertions, 2 deletions
diff --git a/prism/regexp.c b/prism/regexp.c
index ba498ecc83..6e0fdd295c 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -565,21 +565,36 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
  */
 static bool
 pm_regexp_parse_item(pm_regexp_parser_t *parser) {
-    switch (*parser->cursor++) {
+    switch (*parser->cursor) {
         case '^':
         case '$':
+            parser->cursor++;
             return true;
         case '\\':
+            parser->cursor++;
             if (!pm_regexp_char_is_eof(parser)) {
                 parser->cursor++;
             }
             return pm_regexp_parse_quantifier(parser);
         case '(':
+            parser->cursor++;
             return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
         case '[':
+            parser->cursor++;
             return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
-        default:
+        default: {
+            size_t width;
+            if (!parser->encoding_changed) {
+                width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+            } else {
+                width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+            }
+
+            if (width == 0) return false; // TODO: add appropriate error
+            parser->cursor += width;
+
             return pm_regexp_parse_quantifier(parser);
+        }
     }
 }
 
diff --git a/test/prism/fixtures/regex_char_width.txt b/test/prism/fixtures/regex_char_width.txt
new file mode 100644
index 0000000000..7096b71584
--- /dev/null
+++ b/test/prism/fixtures/regex_char_width.txt
@@ -0,0 +1,3 @@
+# encoding: sjis
+/Ⅷ(?<a>.)Ⅹ(?<b>.)/ =~ 'ⅧaⅩb'
+[a, b]
diff --git a/test/prism/parser_test.rb b/test/prism/parser_test.rb
index 26cc2f5b97..ad06af4359 100644
--- a/test/prism/parser_test.rb
+++ b/test/prism/parser_test.rb
@@ -55,6 +55,7 @@ module Prism
       dos_endings.txt
       heredocs_with_ignored_newlines.txt
       regex.txt
+      regex_char_width.txt
       spanning_heredoc.txt
       spanning_heredoc_newlines.txt
       tilde_heredocs.txt
diff --git a/test/prism/ruby_parser_test.rb b/test/prism/ruby_parser_test.rb
index f89aa4c23e..a71d05e78c 100644
--- a/test/prism/ruby_parser_test.rb
+++ b/test/prism/ruby_parser_test.rb
@@ -30,6 +30,7 @@ module Prism
     todos = %w[
       heredocs_nested.txt
       newline_terminated.txt
+      regex_char_width.txt
       seattlerb/bug169.txt
       seattlerb/dstr_evstr.txt
       seattlerb/heredoc_squiggly_interp.txt
diff --git a/test/prism/snapshots/regex_char_width.txt b/test/prism/snapshots/regex_char_width.txt
new file mode 100644
index 0000000000..6bf2169b2f
--- /dev/null
+++ b/test/prism/snapshots/regex_char_width.txt
@@ -0,0 +1,50 @@
+@ ProgramNode (location: (2,0)-(3,6))
+├── locals: [:a, :b]
+└── statements:
+    @ StatementsNode (location: (2,0)-(3,6))
+    └── body: (length: 2)
+        ├── @ MatchWriteNode (location: (2,0)-(2,36))
+        │   ├── call:
+        │   │   @ CallNode (location: (2,0)-(2,36))
+        │   │   ├── flags: ∅
+        │   │   ├── receiver:
+        │   │   │   @ RegularExpressionNode (location: (2,0)-(2,22))
+        │   │   │   ├── flags: ∅
+        │   │   │   ├── opening_loc: (2,0)-(2,1) = "/"
+        │   │   │   ├── content_loc: (2,1)-(2,21) = "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)"
+        │   │   │   ├── closing_loc: (2,21)-(2,22) = "/"
+        │   │   │   └── unescaped: "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)"
+        │   │   ├── call_operator_loc: ∅
+        │   │   ├── name: :=~
+        │   │   ├── message_loc: (2,23)-(2,25) = "=~"
+        │   │   ├── opening_loc: ∅
+        │   │   ├── arguments:
+        │   │   │   @ ArgumentsNode (location: (2,26)-(2,36))
+        │   │   │   ├── flags: ∅
+        │   │   │   └── arguments: (length: 1)
+        │   │   │       └── @ StringNode (location: (2,26)-(2,36))
+        │   │   │           ├── flags: ∅
+        │   │   │           ├── opening_loc: (2,26)-(2,27) = "'"
+        │   │   │           ├── content_loc: (2,27)-(2,35) = "\x{E285}\xA7a\x{E285}\xA9b"
+        │   │   │           ├── closing_loc: (2,35)-(2,36) = "'"
+        │   │   │           └── unescaped: "\x{E285}\xA7a\x{E285}\xA9b"
+        │   │   ├── closing_loc: ∅
+        │   │   └── block: ∅
+        │   └── targets: (length: 2)
+        │       ├── @ LocalVariableTargetNode (location: (2,7)-(2,8))
+        │       │   ├── name: :a
+        │       │   └── depth: 0
+        │       └── @ LocalVariableTargetNode (location: (2,17)-(2,18))
+        │           ├── name: :b
+        │           └── depth: 0
+        └── @ ArrayNode (location: (3,0)-(3,6))
+            ├── flags: ∅
+            ├── elements: (length: 2)
+            │   ├── @ LocalVariableReadNode (location: (3,1)-(3,2))
+            │   │   ├── name: :a
+            │   │   └── depth: 0
+            │   └── @ LocalVariableReadNode (location: (3,4)-(3,5))
+            │       ├── name: :b
+            │       └── depth: 0
+            ├── opening_loc: (3,0)-(3,1) = "["
+            └── closing_loc: (3,5)-(3,6) = "]"
author	Kevin Newton <kddnewton@gmail.com>	2024-02-18 16:36:16 -0500
committer	git <svn-admin@ruby-lang.org>	2024-02-18 21:42:09 +0000
commit	ec1eda7b6270fc433682c2e705381bb7959c7195 (patch)
tree	14791ddd5046900887d7ed6d3a3076dca49e154f
parent	ea529dd409a7edcaa2c6154a315ec6766b98459d (diff)