2 files changed, 79 insertions, 34 deletions
diff --git a/parse.y b/parse.y
index 13cb3842d5..abc980d6fc 100644
--- a/parse.y
+++ b/parse.y
@@ -7279,6 +7279,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
     return TRUE;
 }
 
+static int tokadd_mbchar(struct parser_params *p, int c); /* forward declaration: called by tokadd_utf8 below */
+
 /* return value is for ?\u3042 */
 static void
 tokadd_utf8(struct parser_params *p, rb_encoding **encp,
@@ -7296,44 +7298,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
     if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
 
     if (peek(p, open_brace)) {  /* handle \u{...} form */
-        const char *second = NULL;
-        int c, last = nextc(p);
-        if (p->lex.pcur >= p->lex.pend) goto unterminated;
-        while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
-        while (c != close_brace) {
-            if (c == term) goto unterminated;
-            if (second == multiple_codepoints)
-                second = p->lex.pcur;
-            if (regexp_literal) tokadd(p, last);
-            if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
-                break;
-            }
-            while (ISSPACE(c = *p->lex.pcur)) {
-                if (++p->lex.pcur >= p->lex.pend) goto unterminated;
-                last = c;
+        if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) {
+            /*
+             * Skip codepoint validation here and copy bytes as-is until term or
+             * closing brace, in order to correctly handle extended regexps where
+             * invalid Unicode escapes are allowed in comments. The regexp parser
+             * does its own validation and will catch any issues.
+             */
+            int c = *p->lex.pcur;
+            tokadd(p, c);
+            for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) {
+                if (c == close_brace) {
+                    tokadd(p, c);
+                    ++p->lex.pcur;
+                    break;
+                }
+                else if (c == term) {
+                    break;
+                }
+                if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) {
+                    tokadd(p, c);
+                    c = *++p->lex.pcur;
+                }
+                tokadd_mbchar(p, c);
             }
-            if (term == -1 && !second)
-                second = multiple_codepoints;
         }
+        else {
+            const char *second = NULL;
+            int c, last = nextc(p);
+            if (p->lex.pcur >= p->lex.pend) goto unterminated;
+            while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
+            while (c != close_brace) {
+                if (c == term) goto unterminated;
+                if (second == multiple_codepoints)
+                    second = p->lex.pcur;
+                if (regexp_literal) tokadd(p, last);
+                if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
+                    break;
+                }
+                while (ISSPACE(c = *p->lex.pcur)) {
+                    if (++p->lex.pcur >= p->lex.pend) goto unterminated;
+                    last = c;
+                }
+                if (term == -1 && !second)
+                    second = multiple_codepoints;
+            }
 
-        if (c != close_brace) {
-          unterminated:
-            token_flush(p);
-            yyerror0("unterminated Unicode escape");
-            return;
-        }
-        if (second && second != multiple_codepoints) {
-            const char *pcur = p->lex.pcur;
-            p->lex.pcur = second;
-            dispatch_scan_event(p, tSTRING_CONTENT);
-            token_flush(p);
-            p->lex.pcur = pcur;
-            yyerror0(multiple_codepoints);
-            token_flush(p);
-        }
+            if (c != close_brace) {
+              unterminated:
+                token_flush(p);
+                yyerror0("unterminated Unicode escape");
+                return;
+            }
+            if (second && second != multiple_codepoints) {
+                const char *pcur = p->lex.pcur;
+                p->lex.pcur = second;
+                dispatch_scan_event(p, tSTRING_CONTENT);
+                token_flush(p);
+                p->lex.pcur = pcur;
+                yyerror0(multiple_codepoints);
+                token_flush(p);
+            }
 
-        if (regexp_literal) tokadd(p, close_brace);
-        nextc(p);
+            if (regexp_literal) tokadd(p, close_brace);
+            nextc(p);
+        }
     }
     else {			/* handle \uxxxx form */
         if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb
index bf0d9f1bd5..cf989d190b 100644
--- a/test/ruby/test_parse.rb
+++ b/test/ruby/test_parse.rb
@@ -1052,6 +1052,22 @@ x = __ENCODING__
     assert_syntax_error("    0b\n", /\^/)
   end
 
+  def test_unclosed_unicode_escape_at_eol_bug_19750 # unterminated \u and \u{ escapes in regexp literals: syntax errors, except after '#' in /x regexps
+    assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}") # the lines between "begin;" and "end;" form the script string passed as the second argument
+    begin;
+      assert_syntax_error("/\\u", /too short escape sequence/)
+      assert_syntax_error("/\\u{", /unterminated regexp meets end of file/)
+      assert_syntax_error("/\\u{\\n", /invalid Unicode list/)
+      assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/)
+      re = eval("/a#\\u{\n$/x")
+      assert_match(re, 'a')
+      assert_not_match(re, 'a#')
+      re = eval("/a#\\u\n$/x")
+      assert_match(re, 'a')
+      assert_not_match(re, 'a#')
+    end;
+  end
+
   def test_error_def_in_argument
     assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
     begin;