Multiple codepoints are not allowed in a single character literal

Such literals (e.g. `?\u{41 42}`) have been unintentionally accepted since Ruby 2.5.
author: Nobuyoshi Nakada <nobu@ruby-lang.org> 2019-07-05 22:18:08 +0900
committer: Nobuyoshi Nakada <nobu@ruby-lang.org> 2019-07-05 22:39:54 +0900
commit: d746a41e85b746a90eef20c46d24880fe084ffc5 (patch)
tree: 1065357f1862a72fca4327cb29554ac6cb652fef
parent: 0a2f598d23ef54ce906ebe302cc06e07a16f9022 (diff)
2 files changed, 21 insertions, 5 deletions
diff --git a/parse.y b/parse.y
index 9ec262c5a5..3708601e35 100644
--- a/parse.y
+++ b/parse.y
@@ -6246,24 +6246,28 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
 /* return value is for ?\u3042 */
 static void
 tokadd_utf8(struct parser_params *p, rb_encoding **encp,
-	    int string_literal, int symbol_literal, int regexp_literal)
+	    int term, int symbol_literal, int regexp_literal)
 {
     /*
-     * If string_literal is true, then we allow multiple codepoints
-     * in \u{}, and add the codepoints to the current token.
-     * Otherwise we're parsing a character literal and return a single
-     * codepoint without adding it
+     * If `term` is not -1, then we allow multiple codepoints in \u{}
+     * up to the `term` byte; otherwise we're parsing a character literal.
+     * In either case, the codepoints are added to the current token.
      */
+    static const char multiple_codepoints[] = "Multiple codepoints at single character literal";
 
     const int open_brace = '{', close_brace = '}';
 
     if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
 
     if (peek(p, open_brace)) {  /* handle \u{...} form */
+	const char *second = NULL;
 	int c, last = nextc(p);
 	if (p->lex.pcur >= p->lex.pend) goto unterminated;
 	while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
 	while (c != close_brace) {
+	    if (c == term) goto unterminated;
+	    if (second == multiple_codepoints)
+		second = p->lex.pcur;
 	    if (regexp_literal) tokadd(p, last);
 	    if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
 		break;
@@ -6272,6 +6276,8 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 		if (++p->lex.pcur >= p->lex.pend) goto unterminated;
 		last = c;
 	    }
+	    if (term == -1 && !second)
+		second = multiple_codepoints;
 	}
 
 	if (c != close_brace) {
@@ -6280,6 +6286,15 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 	    yyerror0("unterminated Unicode escape");
 	    return;
 	}
+	if (second && second != multiple_codepoints) {
+	    const char *pcur = p->lex.pcur;
+	    p->lex.pcur = second;
+	    dispatch_scan_event(p, tSTRING_CONTENT);
+	    token_flush(p);
+	    p->lex.pcur = pcur;
+	    yyerror0(multiple_codepoints);
+	    token_flush(p);
+	}
 
 	if (regexp_literal) tokadd(p, close_brace);
 	nextc(p);
diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb
index 4a0c296a84..75f0896bce 100644
--- a/test/ruby/test_parse.rb
+++ b/test/ruby/test_parse.rb
@@ -577,6 +577,7 @@ class TestParse < Test::Unit::TestCase
     assert_equal("\u{1234}", eval("?\u{1234}"))
     assert_equal("\u{1234}", eval('?\u{1234}'))
     assert_equal("\u{1234}", eval('?\u1234'))
+    assert_syntax_error('?\u{41 42}', 'Multiple codepoints at single character literal')
     e = assert_syntax_error('"#{?\u123}"', 'invalid Unicode escape')
     assert_not_match(/end-of-input/, e.message)
author	Nobuyoshi Nakada <nobu@ruby-lang.org>	2019-07-05 22:18:08 +0900
committer	Nobuyoshi Nakada <nobu@ruby-lang.org>	2019-07-05 22:39:54 +0900
commit	d746a41e85b746a90eef20c46d24880fe084ffc5 (patch)
tree	1065357f1862a72fca4327cb29554ac6cb652fef
parent	0a2f598d23ef54ce906ebe302cc06e07a16f9022 (diff)