[ruby/yarp] fix: octal, hex, and unicode strings at the end of a file (https://github.com/ruby/yarp/pull/1371)

* refactor: move EOF check into yp_unescape_calculate_difference

  parser_lex is a bit more readable when we can rely on that behavior

* fix: octal and hex digits at the end of a file

  Previously this resulted in invalid memory access.

* fix: unicode strings at the end of a file

  Previously this resulted in invalid memory access.

* Unterminated curly-bracket unicode is a syntax error

https://github.com/ruby/yarp/commit/21cf11acb5
author: Mike Dalessio <mike.dalessio@gmail.com> 2023-08-31 18:40:29 -0400
committer: git <svn-admin@ruby-lang.org> 2023-08-31 22:40:35 +0000
commit: df4c77608e76068deed58b2781674b0eb247c325 (patch)
tree: 4d159be612246f572dbce77d9bc1f49b4c2af6f7
parent: 9930363aab6ac4b8d7034baff85cd86c17953dc9 (diff)
4 files changed, 66 insertions, 38 deletions
diff --git a/test/yarp/errors_test.rb b/test/yarp/errors_test.rb
index 0e1931ec6a..9e60f5d98b 100644
--- a/test/yarp/errors_test.rb
+++ b/test/yarp/errors_test.rb
@@ -621,6 +621,13 @@ module YARP
       ]
     end
 
+    def test_unterminated_unicode_brackets_should_be_a_syntax_error
+      assert_errors expression('?\\u{3'), '?\\u{3', [
+        ["invalid Unicode escape.", 1..5],
+        ["invalid Unicode escape.", 1..5],
+      ]
+    end
+
     def test_method_parameters_after_block
       expected = DefNode(
         Location(),
diff --git a/test/yarp/fuzzer_test.rb b/test/yarp/fuzzer_test.rb
index 568c2eaf08..f4abcd4ac8 100644
--- a/test/yarp/fuzzer_test.rb
+++ b/test/yarp/fuzzer_test.rb
@@ -22,6 +22,19 @@ module YARP
     snippet "incomplete escaped list", "%w[\\"
     snippet "incomplete escaped regex", "/a\\"
     snippet "unterminated heredoc with unterminated escape at end of file", "<<A\n\\"
+    snippet "escaped octal at end of file 1", '"\\3'
+    snippet "escaped octal at end of file 2", '"\\33'
+    snippet "escaped hex at end of file 1", '"\\x'
+    snippet "escaped hex at end of file 2", '"\\x3'
+    snippet "escaped unicode at end of file 1", '"\\u{3'
+    snippet "escaped unicode at end of file 2", '"\\u{33'
+    snippet "escaped unicode at end of file 3", '"\\u{333'
+    snippet "escaped unicode at end of file 4", '"\\u{3333'
+    snippet "escaped unicode at end of file 5", '"\\u{33333'
+    snippet "escaped unicode at end of file 6", '"\\u{333333'
+    snippet "escaped unicode at end of file 7", '"\\u3'
+    snippet "escaped unicode at end of file 8", '"\\u33'
+    snippet "escaped unicode at end of file 9", '"\\u333'
 
     snippet "statements node with multiple heredocs", <<~EOF
       for <<A + <<B
diff --git a/yarp/unescape.c b/yarp/unescape.c
index b0aabf5e22..14c0faf2eb 100644
--- a/yarp/unescape.c
+++ b/yarp/unescape.c
@@ -69,17 +69,15 @@ char_is_ascii_printable(const uint8_t b) {
 // Scan the 1-3 digits of octal into the value. Returns the number of digits
 // scanned.
 static inline size_t
-unescape_octal(const uint8_t *backslash, uint8_t *value) {
+unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
     *value = (uint8_t) (backslash[1] - '0');
-    if (!yp_char_is_octal_digit(backslash[2])) {
+    if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
         return 2;
     }
-
     *value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
-    if (!yp_char_is_octal_digit(backslash[3])) {
+    if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
         return 3;
     }
-
     *value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
     return 4;
 }
@@ -93,12 +91,15 @@ unescape_hexadecimal_digit(const uint8_t value) {
 // Scan the 1-2 digits of hexadecimal into the value. Returns the number of
 // digits scanned.
 static inline size_t
-unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) {
+unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
+    *value = 0;
+    if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
+        return 2;
+    }
     *value = unescape_hexadecimal_digit(backslash[2]);
-    if (!yp_char_is_hexadecimal_digit(backslash[3])) {
+    if (backslash + 3 >=  end || !yp_char_is_hexadecimal_digit(backslash[3])) {
         return 3;
     }
-
     *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
     return 4;
 }
@@ -204,7 +205,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
         case '0': case '1': case '2': case '3': case '4':
         case '5': case '6': case '7': case '8': case '9': {
             uint8_t value;
-            const uint8_t *cursor = backslash + unescape_octal(backslash, &value);
+            const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
 
             if (dest) {
                 dest[(*dest_length)++] = unescape_char(value, flags);
@@ -214,7 +215,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
         // \xnn         hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
         case 'x': {
             uint8_t value;
-            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value);
+            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end);
 
             if (dest) {
                 dest[(*dest_length)++] = unescape_char(value, flags);
@@ -236,13 +237,14 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
 
                 unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
 
-                while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
+                while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
                     const uint8_t *unicode_start = unicode_cursor;
                     size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
 
                     // \u{nnnn} character literal allows only 1-6 hexadecimal digits
-                    if (hexadecimal_length > 6)
+                    if (hexadecimal_length > 6) {
                         yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
+                    }
 
                     // there are not hexadecimal characters
                     if (hexadecimal_length == 0) {
@@ -269,10 +271,16 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
                 if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
                     yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
 
-                return unicode_cursor + 1;
+
+                if (unicode_cursor < end && *unicode_cursor == '}') {
+                    unicode_cursor++;
+                } else {
+                    yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape.");
+                }
+                return unicode_cursor;
             }
 
-            if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
+            if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
                 uint32_t value;
                 unescape_unicode(backslash + 2, 4, &value);
 
@@ -538,6 +546,10 @@ size_t
 yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
     assert(unescape_type != YP_UNESCAPE_NONE);
 
+    if (backslash + 1 >= parser->end) {
+        return 0;
+    }
+
     switch (backslash[1]) {
         case '\\':
         case '\'':
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 376dfe32c3..176e9f76b6 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -7002,17 +7002,16 @@ parser_lex(yp_parser_t *parser) {
                 // literally. In this case we'll skip past the next character
                 // and find the next breakpoint.
                 if (*breakpoint == '\\') {
-                    // Check that we're not at the end of the file.
-                    if (breakpoint + 1 >= parser->end) {
+                    yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                    if (difference == 0) {
+                        // we're at the end of the file
                         breakpoint = NULL;
                         continue;
                     }
 
-                    yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
-                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
-
                     // If the result is an escaped newline ...
-                    if (*(breakpoint + difference - 1) == '\n') {
+                    if (breakpoint[difference - 1] == '\n') {
                         if (parser->heredoc_end) {
                             // ... if we are on the same line as a heredoc, flush the heredoc and
                             // continue parsing after heredoc_end.
@@ -7141,16 +7140,15 @@ parser_lex(yp_parser_t *parser) {
                 // literally. In this case we'll skip past the next character
                 // and find the next breakpoint.
                 if (*breakpoint == '\\') {
-                    // Check that we're not at the end of the file.
-                    if (breakpoint + 1 >= parser->end) {
+                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
+                    if (difference == 0) {
+                        // we're at the end of the file
                         breakpoint = NULL;
                         continue;
                     }
 
-                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
-
                     // If the result is an escaped newline ...
-                    if (*(breakpoint + difference - 1) == '\n') {
+                    if (breakpoint[difference - 1] == '\n') {
                         if (parser->heredoc_end) {
                             // ... if we are on the same line as a heredoc, flush the heredoc and
                             // continue parsing after heredoc_end.
@@ -7293,20 +7291,19 @@ parser_lex(yp_parser_t *parser) {
                         breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
                         break;
                     case '\\': {
-                        // Check that we're not at the end of the file.
-                        if (breakpoint + 1 >= parser->end) {
-                            breakpoint = NULL;
-                            break;
-                        }
-
                         // If we hit escapes, then we need to treat the next token
                         // literally. In this case we'll skip past the next character and
                         // find the next breakpoint.
                         yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
                         size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                        if (difference == 0) {
+                            // we're at the end of the file
+                            breakpoint = NULL;
+                            break;
+                        }
 
                         // If the result is an escaped newline ...
-                        if (*(breakpoint + difference - 1) == '\n') {
+                        if (breakpoint[difference - 1] == '\n') {
                             if (parser->heredoc_end) {
                                 // ... if we are on the same line as a heredoc, flush the heredoc and
                                 // continue parsing after heredoc_end.
@@ -7463,12 +7460,6 @@ parser_lex(yp_parser_t *parser) {
                         break;
                     }
                     case '\\': {
-                        // Check that we're not at the end of the file.
-                        if (breakpoint + 1 >= parser->end) {
-                            breakpoint = NULL;
-                            break;
-                        }
-
                         // If we hit an escape, then we need to skip past
                         // however many characters the escape takes up. However
                         // it's important that if \n or \r\n are escaped that we
@@ -7481,6 +7472,11 @@ parser_lex(yp_parser_t *parser) {
                         } else {
                             yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
                             size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+                            if (difference == 0) {
+                                // we're at the end of the file
+                                breakpoint = NULL;
+                                break;
+                            }
 
                             yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
author	Mike Dalessio <mike.dalessio@gmail.com>	2023-08-31 18:40:29 -0400
committer	git <svn-admin@ruby-lang.org>	2023-08-31 22:40:35 +0000
commit	df4c77608e76068deed58b2781674b0eb247c325 (patch)
tree	4d159be612246f572dbce77d9bc1f49b4c2af6f7
parent	9930363aab6ac4b8d7034baff85cd86c17953dc9 (diff)