[ruby/prism] Better invalid token messages

https://github.com/ruby/prism/commit/8c9bed2a4d
author: Kevin Newton <kddnewton@gmail.com> 2024-02-06 12:59:47 -0500
committer: git <svn-admin@ruby-lang.org> 2024-02-06 18:10:50 +0000
commit: f5b368df0ceb1e705cd94e39ef8459dae07e6d52 (patch)
tree: 602618ff2ff30b8ca7d188fe0b4e071ac02366a0
parent: ccec209b2cced2ddb8463c4933ef729a44d0363c (diff)
4 files changed, 21 insertions, 7 deletions
diff --git a/prism/diagnostic.c b/prism/diagnostic.c
index df7ae381ba..c718246c80 100644
--- a/prism/diagnostic.c
+++ b/prism/diagnostic.c
@@ -198,8 +198,10 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
     [PM_ERR_INVALID_NUMBER_HEXADECIMAL]         = { "invalid hexadecimal number", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_NUMBER_OCTAL]               = { "invalid octal number", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_NUMBER_UNDERSCORE]          = { "invalid underscore placement in number", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_INVALID_CHARACTER]                  = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_INVALID_MULTIBYTE_CHARACTER]        = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL },
+    [PM_ERR_INVALID_PRINTABLE_CHARACTER]        = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_INVALID_PERCENT]                    = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT?
-    [PM_ERR_INVALID_TOKEN]                      = { "invalid token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT?
     [PM_ERR_INVALID_VARIABLE_GLOBAL]            = { "invalid global variable", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_IT_NOT_ALLOWED]                     = { "`it` is not allowed when an ordinary parameter is defined", PM_ERROR_LEVEL_FATAL },
     [PM_ERR_LAMBDA_OPEN]                        = { "expected a `do` keyword or a `{` to open the lambda block", PM_ERROR_LEVEL_FATAL },
diff --git a/prism/diagnostic.h b/prism/diagnostic.h
index 35a5c88793..019afb96b3 100644
--- a/prism/diagnostic.h
+++ b/prism/diagnostic.h
@@ -196,8 +196,10 @@ typedef enum {
     PM_ERR_INVALID_NUMBER_HEXADECIMAL,
     PM_ERR_INVALID_NUMBER_OCTAL,
     PM_ERR_INVALID_NUMBER_UNDERSCORE,
+    PM_ERR_INVALID_CHARACTER,
+    PM_ERR_INVALID_MULTIBYTE_CHARACTER,
+    PM_ERR_INVALID_PRINTABLE_CHARACTER,
     PM_ERR_INVALID_PERCENT,
-    PM_ERR_INVALID_TOKEN,
     PM_ERR_INVALID_VARIABLE_GLOBAL,
     PM_ERR_IT_NOT_ALLOWED,
     PM_ERR_LAMBDA_OPEN,
diff --git a/prism/prism.c b/prism/prism.c
index 3ed55f06d8..22503fd726 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -9590,11 +9590,21 @@ parser_lex(pm_parser_t *parser) {
                     if (*parser->current.start != '_') {
                         size_t width = char_is_identifier_start(parser, parser->current.start);
 
-                        // If this isn't the beginning of an identifier, then it's an invalid
-                        // token as we've exhausted all of the other options. We'll skip past
-                        // it and return the next token.
+                        // If this isn't the beginning of an identifier, then
+                        // it's an invalid token as we've exhausted all of the
+                        // other options. We'll skip past it and return the next
+                        // token after adding an appropriate error message.
                         if (!width) {
-                            pm_parser_err_current(parser, PM_ERR_INVALID_TOKEN);
+                            pm_diagnostic_id_t diag_id;
+                            if (*parser->current.start >= 0x80) {
+                                diag_id = PM_ERR_INVALID_MULTIBYTE_CHARACTER;
+                            } else if (char_is_ascii_printable(*parser->current.start) || (*parser->current.start == '\\')) {
+                                diag_id = PM_ERR_INVALID_PRINTABLE_CHARACTER;
+                            } else {
+                                diag_id = PM_ERR_INVALID_CHARACTER;
+                            }
+
+                            PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, *parser->current.start);
                             goto lex_next_token;
                         }
 
diff --git a/test/prism/format_errors_test.rb b/test/prism/format_errors_test.rb
index bc0b26165d..a142e8eee1 100644
--- a/test/prism/format_errors_test.rb
+++ b/test/prism/format_errors_test.rb
@@ -16,7 +16,7 @@ module Prism
       assert_equal <<~'ERROR', Debug.format_errors('"%W"\u"', false)
         > 1 | "%W"\u"
             |     ^ expected a newline or semicolon after the statement
-            |     ^ invalid token
+            |     ^ invalid character `\`
             |        ^ expected a closing delimiter for the string literal
       ERROR
     end
author	Kevin Newton <kddnewton@gmail.com>	2024-02-06 12:59:47 -0500
committer	git <svn-admin@ruby-lang.org>	2024-02-06 18:10:50 +0000
commit	f5b368df0ceb1e705cd94e39ef8459dae07e6d52 (patch)
tree	602618ff2ff30b8ca7d188fe0b4e071ac02366a0
parent	ccec209b2cced2ddb8463c4933ef729a44d0363c (diff)