summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- parse.y 97
-rw-r--r-- test/ruby/test_parse.rb 16
2 files changed, 79 insertions, 34 deletions
diff --git a/parse.y b/parse.y
index 13cb3842d5..abc980d6fc 100644
--- a/parse.y
+++ b/parse.y
@@ -7279,6 +7279,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
return TRUE;
}
+static int tokadd_mbchar(struct parser_params *p, int c);
+
/* return value is for ?\u3042 */
static void
tokadd_utf8(struct parser_params *p, rb_encoding **encp,
@@ -7296,44 +7298,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
if (peek(p, open_brace)) { /* handle \u{...} form */
- const char *second = NULL;
- int c, last = nextc(p);
- if (p->lex.pcur >= p->lex.pend) goto unterminated;
- while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
- while (c != close_brace) {
- if (c == term) goto unterminated;
- if (second == multiple_codepoints)
- second = p->lex.pcur;
- if (regexp_literal) tokadd(p, last);
- if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
- break;
- }
- while (ISSPACE(c = *p->lex.pcur)) {
- if (++p->lex.pcur >= p->lex.pend) goto unterminated;
- last = c;
+ if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) {
+ /*
+ * Skip parsing validation code and copy bytes as-is until term or
+ * closing brace, in order to correctly handle extended regexps where
+ * invalid unicode escapes are allowed in comments. The regexp parser
+ * does its own validation and will catch any issues.
+ */
+ int c = *p->lex.pcur;
+ tokadd(p, c);
+ for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) {
+ if (c == close_brace) {
+ tokadd(p, c);
+ ++p->lex.pcur;
+ break;
+ }
+ else if (c == term) {
+ break;
+ }
+ if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) {
+ tokadd(p, c);
+ c = *++p->lex.pcur;
+ }
+ tokadd_mbchar(p, c);
}
- if (term == -1 && !second)
- second = multiple_codepoints;
}
+ else {
+ const char *second = NULL;
+ int c, last = nextc(p);
+ if (p->lex.pcur >= p->lex.pend) goto unterminated;
+ while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
+ while (c != close_brace) {
+ if (c == term) goto unterminated;
+ if (second == multiple_codepoints)
+ second = p->lex.pcur;
+ if (regexp_literal) tokadd(p, last);
+ if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
+ break;
+ }
+ while (ISSPACE(c = *p->lex.pcur)) {
+ if (++p->lex.pcur >= p->lex.pend) goto unterminated;
+ last = c;
+ }
+ if (term == -1 && !second)
+ second = multiple_codepoints;
+ }
- if (c != close_brace) {
- unterminated:
- token_flush(p);
- yyerror0("unterminated Unicode escape");
- return;
- }
- if (second && second != multiple_codepoints) {
- const char *pcur = p->lex.pcur;
- p->lex.pcur = second;
- dispatch_scan_event(p, tSTRING_CONTENT);
- token_flush(p);
- p->lex.pcur = pcur;
- yyerror0(multiple_codepoints);
- token_flush(p);
- }
+ if (c != close_brace) {
+ unterminated:
+ token_flush(p);
+ yyerror0("unterminated Unicode escape");
+ return;
+ }
+ if (second && second != multiple_codepoints) {
+ const char *pcur = p->lex.pcur;
+ p->lex.pcur = second;
+ dispatch_scan_event(p, tSTRING_CONTENT);
+ token_flush(p);
+ p->lex.pcur = pcur;
+ yyerror0(multiple_codepoints);
+ token_flush(p);
+ }
- if (regexp_literal) tokadd(p, close_brace);
- nextc(p);
+ if (regexp_literal) tokadd(p, close_brace);
+ nextc(p);
+ }
}
else { /* handle \uxxxx form */
if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb
index bf0d9f1bd5..cf989d190b 100644
--- a/test/ruby/test_parse.rb
+++ b/test/ruby/test_parse.rb
@@ -1052,6 +1052,22 @@ x = __ENCODING__
assert_syntax_error(" 0b\n", /\^/)
end
+ def test_unclosed_unicode_escape_at_eol_bug_19750
+ assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}")
+ begin;
+ assert_syntax_error("/\\u", /too short escape sequence/)
+ assert_syntax_error("/\\u{", /unterminated regexp meets end of file/)
+ assert_syntax_error("/\\u{\\n", /invalid Unicode list/)
+ assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/)
+ re = eval("/a#\\u{\n$/x")
+ assert_match(re, 'a')
+ assert_not_match(re, 'a#')
+ re = eval("/a#\\u\n$/x")
+ assert_match(re, 'a')
+ assert_not_match(re, 'a#')
+ end;
+ end
+
def test_error_def_in_argument
assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
begin;