parse.y: reject invalid codepoint

* parse.y (parser_tokadd_codepoint): reject invalid codepoints, i.e. those in the surrogate blocks (including surrogate pairs), as mruby does.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@56956 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
author: nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-12-01 08:26:39 +0000
committer: nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e> 2016-12-01 08:26:39 +0000
commit: 74495cfa11743a9cd1b27da81968cfbc00538bc4 (patch)
tree: 5699146b5d7f33d1588d5152805b1490a9568211
parent: df53b1421bd90b79dda52ab39fff9e06941fa978 (diff)
2 files changed, 20 insertions, 12 deletions
diff --git a/parse.y b/parse.y
index b3217adb26..d58d14c4dd 100644
--- a/parse.y
+++ b/parse.y
@@ -5757,11 +5757,15 @@ parser_tok_hex(struct parser_params *parser, size_t *numlen)
 
 #define tokcopy(n) memcpy(tokspace(n), lex_p - (n), (n))
 
-static void
+static int
 parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
 			int string_literal, int regexp_literal,
 			int codepoint, int numlen)
 {
+    if ((codepoint & 0xfffff800) == 0xd800) {
+	yyerror("invalid Unicode codepoint");
+	return FALSE;
+    }
     lex_p += numlen;
     if (regexp_literal) {
 	tokcopy(numlen);
@@ -5773,6 +5777,7 @@ parser_tokadd_codepoint(struct parser_params *parser, rb_encoding **encp,
     else if (string_literal) {
 	tokadd(codepoint);
     }
+    return TRUE;
 }
 
 /* return value is for ?\u3042 */
@@ -5806,8 +5811,11 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
 		yyerror("invalid Unicode codepoint (too large)");
 		return 0;
 	    }
-	    parser_tokadd_codepoint(parser, encp,string_literal, regexp_literal,
-				    codepoint, (int)numlen);
+	    if (!parser_tokadd_codepoint(parser, encp,
+					 string_literal, regexp_literal,
+					 codepoint, (int)numlen)) {
+		return 0;
+	    }
 	    if (ISSPACE(c = nextc())) last = c;
 	} while (string_literal && c != close_brace);
 
@@ -5824,8 +5832,11 @@ parser_tokadd_utf8(struct parser_params *parser, rb_encoding **encp,
 	    yyerror("invalid Unicode escape");
 	    return 0;
 	}
-	parser_tokadd_codepoint(parser, encp, string_literal, regexp_literal,
-				codepoint, 4);
+	if (!parser_tokadd_codepoint(parser, encp,
+				     string_literal, regexp_literal,
+				     codepoint, 4)) {
+	    return 0;
+	}
     }
 
     return codepoint;
diff --git a/test/ruby/test_unicode_escape.rb b/test/ruby/test_unicode_escape.rb
index a7ec3a8ecc..108cf804e2 100644
--- a/test/ruby/test_unicode_escape.rb
+++ b/test/ruby/test_unicode_escape.rb
@@ -264,12 +264,9 @@ EOS
      assert_raise(SyntaxError) { eval %q("\u{ 123 456}")}  # extra space
      assert_raise(SyntaxError) { eval %q("\u{123  456}")}  # extra space
 
-# The utf-8 encoding object currently does not object to codepoints
-# in the surrogate blocks, so these do not raise an error.
-#     assert_raise(SyntaxError) { "\uD800" }       # surrogate block
-#     assert_raise(SyntaxError) { "\uDCBA" }       # surrogate block
-#     assert_raise(SyntaxError) { "\uDFFF" }       # surrogate block
-#     assert_raise(SyntaxError) { "\uD847\uDD9A" } # surrogate pair
-
+     assert_raise(SyntaxError) { eval %q("\uD800") }       # surrogate block
+     assert_raise(SyntaxError) { eval %q("\uDCBA") }       # surrogate block
+     assert_raise(SyntaxError) { eval %q("\uDFFF") }       # surrogate block
+     assert_raise(SyntaxError) { eval %q("\uD847\uDD9A") } # surrogate pair
   end
 end
author	nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2016-12-01 08:26:39 +0000
committer	nobu <nobu@b2dd03c8-39d4-4d8f-98ff-823fe69b080e>	2016-12-01 08:26:39 +0000
commit	74495cfa11743a9cd1b27da81968cfbc00538bc4 (patch)
tree	5699146b5d7f33d1588d5152805b1490a9568211
parent	df53b1421bd90b79dda52ab39fff9e06941fa978 (diff)