[ruby/prism] Properly handle unescaping in regexp

https://github.com/ruby/prism/commit/abf9fd6863
author: Kevin Newton <kddnewton@gmail.com> 2023-10-12 08:46:40 -0400
committer: Kevin Newton <kddnewton@gmail.com> 2023-10-13 15:31:30 -0400
commit: fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch)
tree: bf98c1898db99a2d35aa759c98dfb259f02055a5 /test
parent: e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff)
12 files changed, 63 insertions, 53 deletions
diff --git a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
index e158069bb6..e9bb768383 100644
--- a/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
+++ b/test/prism/snapshots/heredoc_with_escaped_newline_at_start.txt
@@ -21,7 +21,7 @@
         │   │       │   ├── opening_loc: (1,15)-(1,16) = "/"
         │   │       │   ├── content_loc: (1,16)-(1,20) = "^\\s{"
         │   │       │   ├── closing_loc: (1,20)-(1,21) = "/"
-        │   │       │   ├── unescaped: "^ {"
+        │   │       │   ├── unescaped: "^\\s{"
         │   │       │   └── flags: ∅
         │   │       └── @ StringNode (location: (1,23)-(1,25))
         │   │           ├── flags: ∅
@@ -51,7 +51,7 @@
             │       │   ├── opening_loc: (5,15)-(5,16) = "/"
             │       │   ├── content_loc: (5,16)-(5,20) = "^\\s{"
             │       │   ├── closing_loc: (5,20)-(5,21) = "/"
-            │       │   ├── unescaped: "^ {"
+            │       │   ├── unescaped: "^\\s{"
             │       │   └── flags: ∅
             │       └── @ StringNode (location: (5,23)-(5,25))
             │           ├── flags: ∅
diff --git a/test/prism/snapshots/regex.txt b/test/prism/snapshots/regex.txt
index ff0e3d3b56..5fa07265a3 100644
--- a/test/prism/snapshots/regex.txt
+++ b/test/prism/snapshots/regex.txt
@@ -31,7 +31,7 @@
         │   ├── opening_loc: (5,0)-(5,1) = "/"
         │   ├── content_loc: (5,1)-(5,4) = "a\\b"
         │   ├── closing_loc: (5,4)-(5,5) = "/"
-        │   ├── unescaped: "a\b"
+        │   ├── unescaped: "a\\b"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
         │   ├── opening_loc: (7,0)-(7,1) = "/"
@@ -130,25 +130,25 @@
         │   ├── opening_loc: (15,0)-(15,3) = "%r/"
         │   ├── content_loc: (15,3)-(15,24) = "[a-z$._?][\\w$.?\#@~]*:"
         │   ├── closing_loc: (15,24)-(15,26) = "/i"
-        │   ├── unescaped: "[a-z$._?][w$.?\#@~]*:"
+        │   ├── unescaped: "[a-z$._?][\\w$.?\#@~]*:"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (17,0)-(17,37))
         │   ├── opening_loc: (17,0)-(17,3) = "%r/"
         │   ├── content_loc: (17,3)-(17,35) = "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         │   ├── closing_loc: (17,35)-(17,37) = "/i"
-        │   ├── unescaped: "([a-z$._?][w$.?\#@~]*)( +)(equ)"
+        │   ├── unescaped: "([a-z$._?][\\w$.?\#@~]*)(\\s+)(equ)"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (19,0)-(19,25))
         │   ├── opening_loc: (19,0)-(19,3) = "%r/"
         │   ├── content_loc: (19,3)-(19,23) = "[a-z$._?][\\w$.?\#@~]*"
         │   ├── closing_loc: (19,23)-(19,25) = "/i"
-        │   ├── unescaped: "[a-z$._?][w$.?\#@~]*"
+        │   ├── unescaped: "[a-z$._?][\\w$.?\#@~]*"
         │   └── flags: ignore_case
         ├── @ RegularExpressionNode (location: (21,0)-(24,1))
         │   ├── opening_loc: (21,0)-(21,3) = "%r("
         │   ├── content_loc: (21,3)-(23,0) = "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
         │   ├── closing_loc: (24,0)-(24,1) = ")"
-        │   ├── unescaped: "\n(?:[w\#$%_']|()|(,)|[]|[0-9])*\n  (?:[w\#$%_']+)\n"
+        │   ├── unescaped: "\n(?:[\\w\#$%_']|\\(\\)|\\(,\\)|\\[\\]|[0-9])*\n  (?:[\\w\#$%_']+)\n"
         │   └── flags: ∅
         ├── @ CallNode (location: (26,0)-(26,16))
         │   ├── receiver:
@@ -156,7 +156,7 @@
         │   │   ├── opening_loc: (26,0)-(26,1) = "/"
         │   │   ├── content_loc: (26,1)-(26,7) = "(?#\\))"
         │   │   ├── closing_loc: (26,7)-(26,8) = "/"
-        │   │   ├── unescaped: "(?#))"
+        │   │   ├── unescaped: "(?#\\))"
         │   │   └── flags: ∅
         │   ├── call_operator_loc: ∅
         │   ├── message_loc: (26,9)-(26,11) = "=~"
diff --git a/test/prism/snapshots/seattlerb/bug190.txt b/test/prism/snapshots/seattlerb/bug190.txt
index 527304835a..fec48914c9 100644
--- a/test/prism/snapshots/seattlerb/bug190.txt
+++ b/test/prism/snapshots/seattlerb/bug190.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,3) = "%r'"
             ├── content_loc: (1,3)-(1,5) = "\\'"
             ├── closing_loc: (1,5)-(1,6) = "'"
-            ├── unescaped: "'"
+            ├── unescaped: "\\'"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
index 3bc991033c..caf67b892d 100644
--- a/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_C_slash.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,6) = "\\cC\\d"
             ├── closing_loc: (1,6)-(1,7) = "/"
-            ├── unescaped: "\u0003d"
+            ├── unescaped: "\\x03\\d"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_esc_u.txt b/test/prism/snapshots/seattlerb/regexp_esc_u.txt
index adbfe36880..ea6bbb6141 100644
--- a/test/prism/snapshots/seattlerb/regexp_esc_u.txt
+++ b/test/prism/snapshots/seattlerb/regexp_esc_u.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,16) = "[\\u0021-\\u0027]"
             ├── closing_loc: (1,16)-(1,17) = "/"
-            ├── unescaped: "[!-']"
+            ├── unescaped: "[\\u0021-\\u0027]"
             └── flags: ∅
diff --git a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
index 5e039bd16e..74e8b52787 100644
--- a/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
+++ b/test/prism/snapshots/seattlerb/regexp_unicode_curlies.txt
@@ -7,11 +7,11 @@
         │   ├── opening_loc: (1,0)-(1,1) = "/"
         │   ├── content_loc: (1,1)-(1,14) = "\\u{c0de babe}"
         │   ├── closing_loc: (1,14)-(1,15) = "/"
-        │   ├── unescaped: "샞몾"
+        │   ├── unescaped: "\\u{c0de babe}"
         │   └── flags: ∅
         └── @ RegularExpressionNode (location: (3,0)-(3,8))
             ├── opening_loc: (3,0)-(3,1) = "/"
             ├── content_loc: (3,1)-(3,7) = "\\u{df}"
             ├── closing_loc: (3,7)-(3,8) = "/"
-            ├── unescaped: "ß"
+            ├── unescaped: "\\u{df}"
             └── flags: ∅
diff --git a/test/prism/snapshots/spanning_heredoc.txt b/test/prism/snapshots/spanning_heredoc.txt
index 2c59cb4368..6b3e3c92d7 100644
--- a/test/prism/snapshots/spanning_heredoc.txt
+++ b/test/prism/snapshots/spanning_heredoc.txt
@@ -28,10 +28,10 @@
         │   │           │       ├── @ InterpolatedRegularExpressionNode (location: (4,13)-(7,2))
         │   │           │       │   ├── opening_loc: (4,13)-(4,14) = "/"
         │   │           │       │   ├── parts: (length: 2)
-        │   │           │       │   │   ├── @ StringNode (location: (4,14)-(4,0))
+        │   │           │       │   │   ├── @ StringNode (location: (4,14)-(4,16))
         │   │           │       │   │   │   ├── flags: ∅
         │   │           │       │   │   │   ├── opening_loc: ∅
-        │   │           │       │   │   │   ├── content_loc: (4,14)-(4,0) = "b\\\n"
+        │   │           │       │   │   │   ├── content_loc: (4,14)-(4,16) = "b\\"
         │   │           │       │   │   │   ├── closing_loc: ∅
         │   │           │       │   │   │   └── unescaped: "b"
         │   │           │       │   │   └── @ StringNode (location: (7,0)-(7,1))
diff --git a/test/prism/snapshots/unescaping.txt b/test/prism/snapshots/unescaping.txt
index a59dc01626..ee7c3759cb 100644
--- a/test/prism/snapshots/unescaping.txt
+++ b/test/prism/snapshots/unescaping.txt
@@ -17,7 +17,7 @@
         │   ├── opening_loc: (3,0)-(3,1) = "/"
         │   ├── content_loc: (3,1)-(3,7) = "\\c\#{1}"
         │   ├── closing_loc: (3,7)-(3,8) = "/"
-        │   ├── unescaped: "\u0003{1}"
+        │   ├── unescaped: "\\x03{1}"
         │   └── flags: ∅
         ├── @ StringNode (location: (5,0)-(5,8))
         │   ├── flags: ∅
diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt
index 7c477382dc..21e73552ef 100644
--- a/test/prism/snapshots/unparser/corpus/literal/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/literal/literal.txt
@@ -545,7 +545,7 @@
         │   ├── opening_loc: (50,0)-(50,1) = "/"
         │   ├── content_loc: (50,1)-(50,27) = "[^-+',.\\/:@[:alnum:]\\[\\]]+"
         │   ├── closing_loc: (50,27)-(50,28) = "/"
-        │   ├── unescaped: "[^-+',./:@[:alnum:][]]+"
+        │   ├── unescaped: "[^-+',./:@[:alnum:]\\[\\]]+"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (51,0)-(51,12))
         │   ├── opening_loc: (51,0)-(51,1) = "/"
@@ -606,19 +606,19 @@
         │   ├── opening_loc: (54,0)-(54,1) = "/"
         │   ├── content_loc: (54,1)-(54,3) = "\\n"
         │   ├── closing_loc: (54,3)-(54,4) = "/"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: ∅
         ├── @ RegularExpressionNode (location: (55,0)-(55,4))
         │   ├── opening_loc: (55,0)-(55,1) = "/"
         │   ├── content_loc: (55,1)-(55,3) = "\\n"
         │   ├── closing_loc: (55,3)-(55,4) = "/"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: ∅
         ├── @ RegularExpressionNode (location: (56,0)-(56,5))
         │   ├── opening_loc: (56,0)-(56,1) = "/"
         │   ├── content_loc: (56,1)-(56,3) = "\\n"
         │   ├── closing_loc: (56,3)-(56,5) = "/x"
-        │   ├── unescaped: "\n"
+        │   ├── unescaped: "\\n"
         │   └── flags: extended
         ├── @ RegularExpressionNode (location: (57,0)-(57,7))
         │   ├── opening_loc: (57,0)-(57,1) = "/"
diff --git a/test/prism/snapshots/unparser/corpus/semantic/literal.txt b/test/prism/snapshots/unparser/corpus/semantic/literal.txt
index c79d0370da..6da3b56f33 100644
--- a/test/prism/snapshots/unparser/corpus/semantic/literal.txt
+++ b/test/prism/snapshots/unparser/corpus/semantic/literal.txt
@@ -33,7 +33,7 @@
         │   ├── opening_loc: (10,0)-(10,3) = "%r("
         │   ├── content_loc: (10,3)-(10,5) = "\\)"
         │   ├── closing_loc: (10,5)-(10,6) = ")"
-        │   ├── unescaped: ")"
+        │   ├── unescaped: "\\)"
         │   └── flags: ∅
         ├── @ InterpolatedRegularExpressionNode (location: (11,0)-(11,14))
         │   ├── opening_loc: (11,0)-(11,3) = "%r("
diff --git a/test/prism/snapshots/whitequark/parser_bug_830.txt b/test/prism/snapshots/whitequark/parser_bug_830.txt
index f19fffbba0..e380113372 100644
--- a/test/prism/snapshots/whitequark/parser_bug_830.txt
+++ b/test/prism/snapshots/whitequark/parser_bug_830.txt
@@ -7,5 +7,5 @@
             ├── opening_loc: (1,0)-(1,1) = "/"
             ├── content_loc: (1,1)-(1,3) = "\\("
             ├── closing_loc: (1,3)-(1,4) = "/"
-            ├── unescaped: "("
+            ├── unescaped: "\\("
             └── flags: ∅
diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb
index 123c139077..051b5e29d1 100644
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@@ -108,40 +108,50 @@ module Prism
     escapes = [*ascii, *ascii8, *newlines, *octal, *hex2, *hex4, *hex6, *ctrls]
 
     contexts = [
-      [Context::String.new("?", ""),              escapes],
-      [Context::String.new("'", "'"),             escapes],
-      [Context::String.new("\"", "\""),           escapes],
-      [Context::String.new("%q[", "]"),           escapes],
-      [Context::String.new("%Q[", "]"),           escapes],
-      [Context::String.new("%[", "]"),            escapes],
-      [Context::String.new("`", "`"),             escapes],
-      [Context::String.new("%x[", "]"),           escapes],
-      [Context::String.new("<<H\n", "\nH"),       escapes],
-      [Context::String.new("<<'H'\n", "\nH"),     escapes],
-      [Context::String.new("<<\"H\"\n", "\nH"),   escapes],
-      [Context::String.new("<<`H`\n", "\nH"),     escapes],
-      [Context::String.new("<<-H\n", "\nH"),      escapes],
-      [Context::String.new("<<-'H'\n", "\nH"),    escapes],
-      [Context::String.new("<<-\"H\"\n", "\nH"),  escapes],
-      [Context::String.new("<<-`H`\n", "\nH"),    escapes],
-      [Context::Heredoc.new("<<~H\n", "\nH"),     escapes],
-      [Context::Heredoc.new("<<~'H'\n", "\nH"),   escapes],
-      [Context::Heredoc.new("<<~\"H\"\n", "\nH"), escapes],
-      [Context::Heredoc.new("<<~`H`\n", "\nH"),   escapes],
-      [Context::List.new("%w[", "]"),             escapes],
-      [Context::List.new("%W[", "]"),             escapes],
-      [Context::List.new("%i[", "]"),             escapes],
-      [Context::List.new("%I[", "]"),             escapes],
-      [Context::Symbol.new("%s[", "]"),           escapes],
-      [Context::Symbol.new(":'", "'"),            escapes],
-      [Context::Symbol.new(":\"", "\""),          escapes],
-      # [Context::RegExp.new("/", "/"),            escapes],
-      # [Context::RegExp.new("%r[", "]"),          escapes]
+      Context::String.new("?", ""),
+      Context::String.new("'", "'"),
+      Context::String.new("\"", "\""),
+      Context::String.new("%q[", "]"),
+      Context::String.new("%Q[", "]"),
+      Context::String.new("%[", "]"),
+      Context::String.new("`", "`"),
+      Context::String.new("%x[", "]"),
+      Context::String.new("<<H\n", "\nH"),
+      Context::String.new("<<'H'\n", "\nH"),
+      Context::String.new("<<\"H\"\n", "\nH"),
+      Context::String.new("<<`H`\n", "\nH"),
+      Context::String.new("<<-H\n", "\nH"),
+      Context::String.new("<<-'H'\n", "\nH"),
+      Context::String.new("<<-\"H\"\n", "\nH"),
+      Context::String.new("<<-`H`\n", "\nH"),
+      Context::Heredoc.new("<<~H\n", "\nH"),
+      Context::Heredoc.new("<<~'H'\n", "\nH"),
+      Context::Heredoc.new("<<~\"H\"\n", "\nH"),
+      Context::Heredoc.new("<<~`H`\n", "\nH"),
+      Context::List.new("%w[", "]"),
+      Context::List.new("%w<", ">"),
+      Context::List.new("%W[", "]"),
+      Context::List.new("%i[", "]"),
+      Context::List.new("%I[", "]"),
+      Context::Symbol.new("%s[", "]"),
+      Context::Symbol.new(":'", "'"),
+      Context::Symbol.new(":\"", "\""),
+      Context::RegExp.new("/", "/"),
+      Context::RegExp.new("%r[", "]"),
+      Context::RegExp.new("%r<", ">"),
+      Context::RegExp.new("%r{", "}"),
+      Context::RegExp.new("%r(", ")"),
+      Context::RegExp.new("%r|", "|"),
     ]
 
-    contexts.each do |(context, escapes)|
+    contexts.each do |context|
       escapes.each do |escape|
-        next if context.name == "?" && escape == "\xFF".b # wat?
+        # I think this might be a bug in Ruby.
+        next if context.name == "?" && escape == "\xFF".b
+
+        # We don't currently support scanning for the number of capture groups,
+        # so these are all going to fail.
+        next if (context.name == "//" || context.name.start_with?("%r")) && escape.start_with?(/\d/)
 
         define_method(:"test_#{context.name}_#{escape.inspect}") do
           assert_unescape(context, escape)
author	Kevin Newton <kddnewton@gmail.com>	2023-10-12 08:46:40 -0400
committer	Kevin Newton <kddnewton@gmail.com>	2023-10-13 15:31:30 -0400
commit	fa76cddc5b1eebf77c9c5bbe951f70fd6c115716 (patch)
tree	bf98c1898db99a2d35aa759c98dfb259f02055a5 /test
parent	e4f1c06a9bb6012ac155b7a7789d2b5cb4e8abdc (diff)