2 files changed, 139 insertions, 16 deletions
diff --git a/parse.y b/parse.y
index d312797b98..437f4d2845 100644
--- a/parse.y
+++ b/parse.y
@@ -1512,6 +1512,7 @@ YYLTYPE *rb_parser_set_location(struct parser_params *p, YYLTYPE *yylloc);
 void ruby_show_error_line(struct parser_params *p, VALUE errbuf, const YYLTYPE *yylloc, int lineno, rb_parser_string_t *str);
 RUBY_SYMBOL_EXPORT_END
 
+static void flush_string_content(struct parser_params *p, rb_encoding *enc, size_t back);
 static void error_duplicate_pattern_variable(struct parser_params *p, ID id, const YYLTYPE *loc);
 static void error_duplicate_pattern_key(struct parser_params *p, ID id, const YYLTYPE *loc);
 static VALUE formal_argument_error(struct parser_params*, ID);
@@ -7885,6 +7886,7 @@ tok_hex(struct parser_params *p, size_t *numlen)
 
     c = (int)ruby_scan_hex(p->lex.pcur, 2, numlen);
     if (!*numlen) {
+        flush_string_content(p, p->enc, rb_strlen_lit("\\x"));
         yyerror0("invalid hex escape");
         dispatch_scan_event(p, tSTRING_CONTENT);
         return 0;
@@ -7927,27 +7929,33 @@ escaped_control_code(int c)
 
 static int
 tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
-                 int regexp_literal, int wide)
+                 int regexp_literal, const char *begin)
 {
+    const int wide = !begin;
     size_t numlen;
     int codepoint = (int)ruby_scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen);
+
     p->lex.pcur += numlen;
     if (p->lex.strterm == NULL ||
         strterm_is_heredoc(p->lex.strterm) ||
         (p->lex.strterm->u.literal.func != str_regexp)) {
+        if (!begin) begin = p->lex.pcur;
         if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4))  {
-            literal_flush(p, p->lex.pcur);
+            flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin);
             yyerror0("invalid Unicode escape");
+            dispatch_scan_event(p, tSTRING_CONTENT);
             return wide && numlen > 0;
         }
         if (codepoint > 0x10ffff) {
-            literal_flush(p, p->lex.pcur);
+            flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin);
             yyerror0("invalid Unicode codepoint (too large)");
+            dispatch_scan_event(p, tSTRING_CONTENT);
             return wide;
         }
         if ((codepoint & 0xfffff800) == 0xd800) {
-            literal_flush(p, p->lex.pcur);
+            flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin);
             yyerror0("invalid Unicode codepoint");
+            dispatch_scan_event(p, tSTRING_CONTENT);
             return wide;
         }
     }
@@ -8035,7 +8043,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
                 if (second == multiple_codepoints)
                     second = p->lex.pcur;
                 if (regexp_literal) tokadd(p, last);
-                if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
+                if (!tokadd_codepoint(p, encp, regexp_literal, NULL)) {
                     break;
                 }
                 while (ISSPACE(c = peekc(p))) {
@@ -8048,8 +8056,9 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 
             if (c != close_brace) {
               unterminated:
-                token_flush(p);
+                flush_string_content(p, rb_utf8_encoding(), 0);
                 yyerror0("unterminated Unicode escape");
+                dispatch_scan_event(p, tSTRING_CONTENT);
                 return;
             }
             if (second && second != multiple_codepoints) {
@@ -8067,7 +8076,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
         }
     }
     else {			/* handle \uxxxx form */
-        if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
+        if (!tokadd_codepoint(p, encp, regexp_literal, p->lex.pcur - rb_strlen_lit("\\u"))) {
             token_flush(p);
             return;
         }
@@ -8078,7 +8087,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
 #define ESCAPE_META    2
 
 static int
-read_escape(struct parser_params *p, int flags)
+read_escape(struct parser_params *p, int flags, const char *begin)
 {
     int c;
     size_t numlen;
@@ -8137,7 +8146,7 @@ read_escape(struct parser_params *p, int flags)
                 nextc(p);
                 goto eof;
             }
-            return read_escape(p, flags|ESCAPE_META) | 0x80;
+            return read_escape(p, flags|ESCAPE_META, begin) | 0x80;
         }
         else if (c == -1) goto eof;
         else if (!ISASCII(c)) {
@@ -8170,7 +8179,7 @@ read_escape(struct parser_params *p, int flags)
                 nextc(p);
                 goto eof;
             }
-            c = read_escape(p, flags|ESCAPE_CONTROL);
+            c = read_escape(p, flags|ESCAPE_CONTROL, begin);
         }
         else if (c == '?')
             return 0177;
@@ -8205,6 +8214,7 @@ read_escape(struct parser_params *p, int flags)
 
       eof:
       case -1:
+        flush_string_content(p, p->enc, p->lex.pcur - begin);
         yyerror0("Invalid escape character syntax");
         dispatch_scan_event(p, tSTRING_CONTENT);
         return '\0';
@@ -8226,6 +8236,7 @@ tokadd_escape(struct parser_params *p)
 {
     int c;
     size_t numlen;
+    const char *begin = p->lex.pcur;
 
     switch (c = nextc(p)) {
       case '\n':
@@ -8251,6 +8262,7 @@ tokadd_escape(struct parser_params *p)
 
       eof:
       case -1:
+        flush_string_content(p, p->enc, p->lex.pcur - begin);
         yyerror0("Invalid escape character syntax");
         token_flush(p);
         return -1;
@@ -8521,7 +8533,7 @@ tokadd_string(struct parser_params *p,
                       case 'C':
                       case 'M': {
                         pushback(p, c);
-                        c = read_escape(p, 0);
+                        c = read_escape(p, 0, p->lex.pcur - 1);
 
                         char *t = tokspace(p, rb_strlen_lit("\\x00"));
                         *t++ = '\\';
@@ -8547,7 +8559,7 @@ tokadd_string(struct parser_params *p,
                 else if (func & STR_FUNC_EXPAND) {
                     pushback(p, c);
                     if (func & STR_FUNC_ESCAPE) tokadd(p, '\\');
-                    c = read_escape(p, 0);
+                    c = read_escape(p, 0, p->lex.pcur - 1);
                 }
                 else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) {
                     /* ignore backslashed spaces in %w */
@@ -8597,8 +8609,9 @@ tokadd_string(struct parser_params *p,
 #define NEW_STRTERM(func, term, paren) new_strterm(p, func, term, paren)
 
 static void
-flush_string_content(struct parser_params *p, rb_encoding *enc)
+flush_string_content(struct parser_params *p, rb_encoding *enc, size_t back)
 {
+    p->lex.pcur -= back;
     if (has_delayed_token(p)) {
         ptrdiff_t len = p->lex.pcur - p->lex.ptok;
         if (len > 0) {
@@ -8610,6 +8623,7 @@ flush_string_content(struct parser_params *p, rb_encoding *enc)
         p->lex.ptok = p->lex.pcur;
     }
     dispatch_scan_event(p, tSTRING_CONTENT);
+    p->lex.pcur += back;
 }
 
 /* this can be shared with ripper, since it's independent from struct
@@ -8777,7 +8791,7 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote)
     tokfix(p);
     lit = STR_NEW3(tok(p), toklen(p), enc, func);
     set_yylval_str(lit);
-    flush_string_content(p, enc);
+    flush_string_content(p, enc, 0);
 
     return tSTRING_CONTENT;
 }
@@ -9246,7 +9260,7 @@ here_document(struct parser_params *p, rb_strterm_heredoc_t *here)
 #ifndef RIPPER
                 if (bol) nd_set_fl_newline(yylval.node);
 #endif
-                flush_string_content(p, enc);
+                flush_string_content(p, enc, 0);
                 return tSTRING_CONTENT;
             }
             tokadd(p, nextc(p));
@@ -10064,7 +10078,7 @@ parse_qmark(struct parser_params *p, int space_seen)
             if (tokadd_mbchar(p, c) == -1) return 0;
         }
         else {
-            c = read_escape(p, 0);
+            c = read_escape(p, 0, p->lex.pcur - rb_strlen_lit("?\\"));
             tokadd(p, c);
         }
     }
diff --git a/test/ripper/test_lexer.rb b/test/ripper/test_lexer.rb
index 64b4336375..e460a7fd13 100644
--- a/test/ripper/test_lexer.rb
+++ b/test/ripper/test_lexer.rb
@@ -390,6 +390,115 @@ world"
     assert_lexer(expected, code)
   end
 
+  def test_invalid_escape_string  # each malformed escape must be lexed as its own tstring_content token
+    code = "\"hello\\x world"  # single-line, unterminated string with \x lacking hex digits
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "hello", state(:EXPR_BEG)],
+      [[1, 6], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[1, 8], :on_tstring_content, " world", state(:EXPR_BEG)],
+    ]
+    assert_lexer(expected, code)
+    code = "\"\nhello\\x world"  # same escape, but the content before it spans a newline
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n" "hello", state(:EXPR_BEG)],
+      [[2, 5], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[2, 7], :on_tstring_content, " world", state(:EXPR_BEG)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "\"\n\\Cxx\""
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\Cx", state(:EXPR_BEG)],
+      [[2, 3], :on_tstring_content, "x", state(:EXPR_BEG)],
+      [[2, 4], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "\"\n\\Mxx\""
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\Mx", state(:EXPR_BEG)],
+      [[2, 3], :on_tstring_content, "x", state(:EXPR_BEG)],
+      [[2, 4], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "\"\n\\c\\cx\""
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\c\\c", state(:EXPR_BEG)],
+      [[2, 4], :on_tstring_content, "x", state(:EXPR_BEG)],
+      [[2, 5], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "\"\n\\ux\""
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\u", state(:EXPR_BEG)],
+      [[2, 2], :on_tstring_content, "x", state(:EXPR_BEG)],
+      [[2, 3], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "\"\n\\xx\""
+    expected = [
+      [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)],
+      [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[2, 2], :on_tstring_content, "x", state(:EXPR_BEG)],
+      [[2, 3], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "<<A\n\n\\xyz"  # heredoc body
+    expected = [
+      [[1, 0], :on_heredoc_beg, "<<A", state(:EXPR_BEG)],
+      [[1, 3], :on_nl, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[3, 0], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[3, 2], :on_tstring_content, "yz", state(:EXPR_BEG)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "%(\n\\xyz)"  # %-literal
+    expected = [
+      [[1, 0], :on_tstring_beg, "%(", state(:EXPR_BEG)],
+      [[1, 2], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[2, 2], :on_tstring_content, "yz", state(:EXPR_BEG)],
+      [[2, 4], :on_tstring_end, ")", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = "%Q(\n\\xyz)"
+    expected = [
+      [[1, 0], :on_tstring_beg, "%Q(", state(:EXPR_BEG)],
+      [[1, 3], :on_tstring_content, "\n", state(:EXPR_BEG)],
+      [[2, 0], :on_tstring_content, "\\x", state(:EXPR_BEG)],
+      [[2, 2], :on_tstring_content, "yz", state(:EXPR_BEG)],
+      [[2, 4], :on_tstring_end, ")", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+
+    code = ":\"\n\\xyz\""  # quoted symbol
+    expected = [
+      [[1, 0], :on_symbeg, ":\"", state(:EXPR_FNAME)],
+      [[1, 2], :on_tstring_content, "\n", state(:EXPR_FNAME)],
+      [[2, 0], :on_tstring_content, "\\x", state(:EXPR_FNAME)],
+      [[2, 2], :on_tstring_content, "yz", state(:EXPR_FNAME)],
+      [[2, 4], :on_tstring_end, "\"", state(:EXPR_END)],
+    ]
+    assert_lexer(expected, code)
+  end
+
   def assert_lexer(expected, code)
     assert_equal(code, Ripper.tokenize(code).join(""))
     assert_equal(expected, result = Ripper.lex(code),