Handle unterminated unicode escapes in regexps

This fixes an infinite loop that became possible after ec3542229b29ec93062e9d90e877ea29d3c19472.

For \u{} escapes in regexps, skip validation in the parser and rely on the regexp code to handle validation. This is necessary so that invalid unicode escapes in comments in extended regexps are allowed.

Fixes [Bug #19750]

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
author: Jeremy Evans <code@jeremyevans.net> 2023-06-30 19:37:53 -0700
committer: GitHub <noreply@github.com> 2023-06-30 19:37:53 -0700
commit: 1bc8838d60ef3fc6812d3b64ed87caaf0ae943d9 (patch)
tree: bb705036e9357d0bd70e64d3f5e86192761fd41c
parent: 41779fede04d730f94690ddc9f2b36a4ff73ddb0 (diff)
2 files changed, 79 insertions, 34 deletions
diff --git a/parse.y b/parse.y
index 13cb3842d5..abc980d6fc 100644
--- a/parse.y
+++ b/parse.y
@@ -7279,6 +7279,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
     return TRUE;
 }
 
+static int tokadd_mbchar(struct parser_params *p, int c);
+
 /* return value is for ?\u3042 */
 static void
 tokadd_utf8(struct parser_params *p, rb_encoding **encp,
@@ -7296,44 +7298,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
     if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
 
     if (peek(p, open_brace)) {  /* handle \u{...} form */
-        const char *second = NULL;
-        int c, last = nextc(p);
-        if (p->lex.pcur >= p->lex.pend) goto unterminated;
-        while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
-        while (c != close_brace) {
-            if (c == term) goto unterminated;
-            if (second == multiple_codepoints)
-                second = p->lex.pcur;
-            if (regexp_literal) tokadd(p, last);
-            if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
-                break;
-            }
-            while (ISSPACE(c = *p->lex.pcur)) {
-                if (++p->lex.pcur >= p->lex.pend) goto unterminated;
-                last = c;
+        if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) {
+            /*
+             * Skip parsing validation code and copy bytes as-is until term or
+             * closing brace, in order to correctly handle extended regexps where
+             * invalid unicode escapes are allowed in comments. The regexp parser
+             * does its own validation and will catch any issues.
+             */
+            int c = *p->lex.pcur;
+            tokadd(p, c);
+            for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) {
+                if (c == close_brace) {
+                    tokadd(p, c);
+                    ++p->lex.pcur;
+                    break;
+                }
+                else if (c == term) {
+                    break;
+                }
+                if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) {
+                    tokadd(p, c);
+                    c = *++p->lex.pcur;
+                }
+                tokadd_mbchar(p, c);
             }
-            if (term == -1 && !second)
-                second = multiple_codepoints;
         }
+        else {
+            const char *second = NULL;
+            int c, last = nextc(p);
+            if (p->lex.pcur >= p->lex.pend) goto unterminated;
+            while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
+            while (c != close_brace) {
+                if (c == term) goto unterminated;
+                if (second == multiple_codepoints)
+                    second = p->lex.pcur;
+                if (regexp_literal) tokadd(p, last);
+                if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
+                    break;
+                }
+                while (ISSPACE(c = *p->lex.pcur)) {
+                    if (++p->lex.pcur >= p->lex.pend) goto unterminated;
+                    last = c;
+                }
+                if (term == -1 && !second)
+                    second = multiple_codepoints;
+            }
 
-        if (c != close_brace) {
-          unterminated:
-            token_flush(p);
-            yyerror0("unterminated Unicode escape");
-            return;
-        }
-        if (second && second != multiple_codepoints) {
-            const char *pcur = p->lex.pcur;
-            p->lex.pcur = second;
-            dispatch_scan_event(p, tSTRING_CONTENT);
-            token_flush(p);
-            p->lex.pcur = pcur;
-            yyerror0(multiple_codepoints);
-            token_flush(p);
-        }
+            if (c != close_brace) {
+              unterminated:
+                token_flush(p);
+                yyerror0("unterminated Unicode escape");
+                return;
+            }
+            if (second && second != multiple_codepoints) {
+                const char *pcur = p->lex.pcur;
+                p->lex.pcur = second;
+                dispatch_scan_event(p, tSTRING_CONTENT);
+                token_flush(p);
+                p->lex.pcur = pcur;
+                yyerror0(multiple_codepoints);
+                token_flush(p);
+            }
 
-        if (regexp_literal) tokadd(p, close_brace);
-        nextc(p);
+            if (regexp_literal) tokadd(p, close_brace);
+            nextc(p);
+        }
     }
     else {			/* handle \uxxxx form */
         if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb
index bf0d9f1bd5..cf989d190b 100644
--- a/test/ruby/test_parse.rb
+++ b/test/ruby/test_parse.rb
@@ -1052,6 +1052,22 @@ x = __ENCODING__
     assert_syntax_error("    0b\n", /\^/)
   end
 
+  def test_unclosed_unicode_escape_at_eol_bug_19750
+    assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}")
+    begin;
+      assert_syntax_error("/\\u", /too short escape sequence/)
+      assert_syntax_error("/\\u{", /unterminated regexp meets end of file/)
+      assert_syntax_error("/\\u{\\n", /invalid Unicode list/)
+      assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/)
+      re = eval("/a#\\u{\n$/x")
+      assert_match(re, 'a')
+      assert_not_match(re, 'a#')
+      re = eval("/a#\\u\n$/x")
+      assert_match(re, 'a')
+      assert_not_match(re, 'a#')
+    end;
+  end
+
   def test_error_def_in_argument
     assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
     begin;
author	Jeremy Evans <code@jeremyevans.net>	2023-06-30 19:37:53 -0700
committer	GitHub <noreply@github.com>	2023-06-30 19:37:53 -0700
commit	1bc8838d60ef3fc6812d3b64ed87caaf0ae943d9 (patch)
tree	bb705036e9357d0bd70e64d3f5e86192761fd41c
parent	41779fede04d730f94690ddc9f2b36a4ff73ddb0 (diff)