summary refs log tree commit diff
diff options
context:
space:
mode:
author Kevin Newton <kddnewton@gmail.com> 2024-02-18 16:36:16 -0500
committer git <svn-admin@ruby-lang.org> 2024-02-18 21:42:09 +0000
commit ec1eda7b6270fc433682c2e705381bb7959c7195 (patch)
tree 14791ddd5046900887d7ed6d3a3076dca49e154f
parent ea529dd409a7edcaa2c6154a315ec6766b98459d (diff)
[ruby/prism] Account for encoding in regexp named captures
https://github.com/ruby/prism/commit/17dc6b6281
-rw-r--r-- prism/regexp.c 19
-rw-r--r-- test/prism/fixtures/regex_char_width.txt 3
-rw-r--r-- test/prism/parser_test.rb 1
-rw-r--r-- test/prism/ruby_parser_test.rb 1
-rw-r--r-- test/prism/snapshots/regex_char_width.txt 50
5 files changed, 72 insertions, 2 deletions
diff --git a/prism/regexp.c b/prism/regexp.c
index ba498ecc83..6e0fdd295c 100644
--- a/prism/regexp.c
+++ b/prism/regexp.c
@@ -565,21 +565,36 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
*/
static bool
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
- switch (*parser->cursor++) {
+ switch (*parser->cursor) {
case '^':
case '$':
+ parser->cursor++;
return true;
case '\\':
+ parser->cursor++;
if (!pm_regexp_char_is_eof(parser)) {
parser->cursor++;
}
return pm_regexp_parse_quantifier(parser);
case '(':
+ parser->cursor++;
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
case '[':
+ parser->cursor++;
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
- default:
+ default: {
+ size_t width;
+ if (!parser->encoding_changed) {
+ width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+ } else {
+ width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
+ }
+
+ if (width == 0) return false; // TODO: add appropriate error
+ parser->cursor += width;
+
return pm_regexp_parse_quantifier(parser);
+ }
}
}
diff --git a/test/prism/fixtures/regex_char_width.txt b/test/prism/fixtures/regex_char_width.txt
new file mode 100644
index 0000000000..7096b71584
--- /dev/null
+++ b/test/prism/fixtures/regex_char_width.txt
@@ -0,0 +1,3 @@
+# encoding: sjis
+/Ⅷ(?<a>.)Ⅹ(?<b>.)/ =~ 'ⅧaⅩb'
+[a, b]
diff --git a/test/prism/parser_test.rb b/test/prism/parser_test.rb
index 26cc2f5b97..ad06af4359 100644
--- a/test/prism/parser_test.rb
+++ b/test/prism/parser_test.rb
@@ -55,6 +55,7 @@ module Prism
dos_endings.txt
heredocs_with_ignored_newlines.txt
regex.txt
+ regex_char_width.txt
spanning_heredoc.txt
spanning_heredoc_newlines.txt
tilde_heredocs.txt
diff --git a/test/prism/ruby_parser_test.rb b/test/prism/ruby_parser_test.rb
index f89aa4c23e..a71d05e78c 100644
--- a/test/prism/ruby_parser_test.rb
+++ b/test/prism/ruby_parser_test.rb
@@ -30,6 +30,7 @@ module Prism
todos = %w[
heredocs_nested.txt
newline_terminated.txt
+ regex_char_width.txt
seattlerb/bug169.txt
seattlerb/dstr_evstr.txt
seattlerb/heredoc_squiggly_interp.txt
diff --git a/test/prism/snapshots/regex_char_width.txt b/test/prism/snapshots/regex_char_width.txt
new file mode 100644
index 0000000000..6bf2169b2f
--- /dev/null
+++ b/test/prism/snapshots/regex_char_width.txt
@@ -0,0 +1,50 @@
+@ ProgramNode (location: (2,0)-(3,6))
+├── locals: [:a, :b]
+└── statements:
+ @ StatementsNode (location: (2,0)-(3,6))
+ └── body: (length: 2)
+ ├── @ MatchWriteNode (location: (2,0)-(2,36))
+ │ ├── call:
+ │ │ @ CallNode (location: (2,0)-(2,36))
+ │ │ ├── flags: ∅
+ │ │ ├── receiver:
+ │ │ │ @ RegularExpressionNode (location: (2,0)-(2,22))
+ │ │ │ ├── flags: ∅
+ │ │ │ ├── opening_loc: (2,0)-(2,1) = "/"
+ │ │ │ ├── content_loc: (2,1)-(2,21) = "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)"
+ │ │ │ ├── closing_loc: (2,21)-(2,22) = "/"
+ │ │ │ └── unescaped: "\x{E285}\xA7(?<a>.)\x{E285}\xA9(?<b>.)"
+ │ │ ├── call_operator_loc: ∅
+ │ │ ├── name: :=~
+ │ │ ├── message_loc: (2,23)-(2,25) = "=~"
+ │ │ ├── opening_loc: ∅
+ │ │ ├── arguments:
+ │ │ │ @ ArgumentsNode (location: (2,26)-(2,36))
+ │ │ │ ├── flags: ∅
+ │ │ │ └── arguments: (length: 1)
+ │ │ │ └── @ StringNode (location: (2,26)-(2,36))
+ │ │ │ ├── flags: ∅
+ │ │ │ ├── opening_loc: (2,26)-(2,27) = "'"
+ │ │ │ ├── content_loc: (2,27)-(2,35) = "\x{E285}\xA7a\x{E285}\xA9b"
+ │ │ │ ├── closing_loc: (2,35)-(2,36) = "'"
+ │ │ │ └── unescaped: "\x{E285}\xA7a\x{E285}\xA9b"
+ │ │ ├── closing_loc: ∅
+ │ │ └── block: ∅
+ │ └── targets: (length: 2)
+ │ ├── @ LocalVariableTargetNode (location: (2,7)-(2,8))
+ │ │ ├── name: :a
+ │ │ └── depth: 0
+ │ └── @ LocalVariableTargetNode (location: (2,17)-(2,18))
+ │ ├── name: :b
+ │ └── depth: 0
+ └── @ ArrayNode (location: (3,0)-(3,6))
+ ├── flags: ∅
+ ├── elements: (length: 2)
+ │ ├── @ LocalVariableReadNode (location: (3,1)-(3,2))
+ │ │ ├── name: :a
+ │ │ └── depth: 0
+ │ └── @ LocalVariableReadNode (location: (3,4)-(3,5))
+ │ ├── name: :b
+ │ └── depth: 0
+ ├── opening_loc: (3,0)-(3,1) = "["
+ └── closing_loc: (3,5)-(3,6) = "]"