diff options
| author | Mike Dalessio <mike.dalessio@gmail.com> | 2023-08-31 18:40:29 -0400 |
|---|---|---|
| committer | git <svn-admin@ruby-lang.org> | 2023-08-31 22:40:35 +0000 |
| commit | df4c77608e76068deed58b2781674b0eb247c325 (patch) | |
| tree | 4d159be612246f572dbce77d9bc1f49b4c2af6f7 | |
| parent | 9930363aab6ac4b8d7034baff85cd86c17953dc9 (diff) | |
[ruby/yarp] fix: octal, hex, and unicode strings at the end of a file
(https://github.com/ruby/yarp/pull/1371)
* refactor: move EOF check into yp_unescape_calculate_difference
  parser_lex is a bit more readable when we can rely on that behavior.
* fix: octal and hex digits at the end of a file
Previously this resulted in invalid memory access.
* fix: unicode strings at the end of a file
Previously this resulted in invalid memory access.
* Unterminated curly-bracket unicode is a syntax error
https://github.com/ruby/yarp/commit/21cf11acb5
| -rw-r--r-- | test/yarp/errors_test.rb | 7 | ||||
| -rw-r--r-- | test/yarp/fuzzer_test.rb | 13 | ||||
| -rw-r--r-- | yarp/unescape.c | 40 | ||||
| -rw-r--r-- | yarp/yarp.c | 44 |
4 files changed, 66 insertions, 38 deletions
diff --git a/test/yarp/errors_test.rb b/test/yarp/errors_test.rb index 0e1931ec6a..9e60f5d98b 100644 --- a/test/yarp/errors_test.rb +++ b/test/yarp/errors_test.rb @@ -621,6 +621,13 @@ module YARP ] end + def test_unterminated_unicode_brackets_should_be_a_syntax_error + assert_errors expression('?\\u{3'), '?\\u{3', [ + ["invalid Unicode escape.", 1..5], + ["invalid Unicode escape.", 1..5], + ] + end + def test_method_parameters_after_block expected = DefNode( Location(), diff --git a/test/yarp/fuzzer_test.rb b/test/yarp/fuzzer_test.rb index 568c2eaf08..f4abcd4ac8 100644 --- a/test/yarp/fuzzer_test.rb +++ b/test/yarp/fuzzer_test.rb @@ -22,6 +22,19 @@ module YARP snippet "incomplete escaped list", "%w[\\" snippet "incomplete escaped regex", "/a\\" snippet "unterminated heredoc with unterminated escape at end of file", "<<A\n\\" + snippet "escaped octal at end of file 1", '"\\3' + snippet "escaped octal at end of file 2", '"\\33' + snippet "escaped hex at end of file 1", '"\\x' + snippet "escaped hex at end of file 2", '"\\x3' + snippet "escaped unicode at end of file 1", '"\\u{3' + snippet "escaped unicode at end of file 2", '"\\u{33' + snippet "escaped unicode at end of file 3", '"\\u{333' + snippet "escaped unicode at end of file 4", '"\\u{3333' + snippet "escaped unicode at end of file 5", '"\\u{33333' + snippet "escaped unicode at end of file 6", '"\\u{333333' + snippet "escaped unicode at end of file 7", '"\\u3' + snippet "escaped unicode at end of file 8", '"\\u33' + snippet "escaped unicode at end of file 9", '"\\u333' snippet "statements node with multiple heredocs", <<~EOF for <<A + <<B diff --git a/yarp/unescape.c b/yarp/unescape.c index b0aabf5e22..14c0faf2eb 100644 --- a/yarp/unescape.c +++ b/yarp/unescape.c @@ -69,17 +69,15 @@ char_is_ascii_printable(const uint8_t b) { // Scan the 1-3 digits of octal into the value. Returns the number of digits // scanned. 
static inline size_t -unescape_octal(const uint8_t *backslash, uint8_t *value) { +unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) { *value = (uint8_t) (backslash[1] - '0'); - if (!yp_char_is_octal_digit(backslash[2])) { + if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) { return 2; } - *value = (uint8_t) ((*value << 3) | (backslash[2] - '0')); - if (!yp_char_is_octal_digit(backslash[3])) { + if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) { return 3; } - *value = (uint8_t) ((*value << 3) | (backslash[3] - '0')); return 4; } @@ -93,12 +91,15 @@ unescape_hexadecimal_digit(const uint8_t value) { // Scan the 1-2 digits of hexadecimal into the value. Returns the number of // digits scanned. static inline size_t -unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) { +unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) { + *value = 0; + if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) { + return 2; + } *value = unescape_hexadecimal_digit(backslash[2]); - if (!yp_char_is_hexadecimal_digit(backslash[3])) { + if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) { return 3; } - *value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3])); return 4; } @@ -204,7 +205,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { uint8_t value; - const uint8_t *cursor = backslash + unescape_octal(backslash, &value); + const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end); if (dest) { dest[(*dest_length)++] = unescape_char(value, flags); @@ -214,7 +215,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F]) case 'x': { uint8_t value; - const uint8_t *cursor = 
backslash + unescape_hexadecimal(backslash, &value); + const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end); if (dest) { dest[(*dest_length)++] = unescape_char(value, flags); @@ -236,13 +237,14 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor); - while ((*unicode_cursor != '}') && (unicode_cursor < end)) { + while ((unicode_cursor < end) && (*unicode_cursor != '}')) { const uint8_t *unicode_start = unicode_cursor; size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor); // \u{nnnn} character literal allows only 1-6 hexadecimal digits - if (hexadecimal_length > 6) + if (hexadecimal_length > 6) { yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape."); + } // there are not hexadecimal characters if (hexadecimal_length == 0) { @@ -269,10 +271,16 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal"); - return unicode_cursor + 1; + + if (unicode_cursor < end && *unicode_cursor == '}') { + unicode_cursor++; + } else { + yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape."); + } + return unicode_cursor; } - if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) { + if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) { uint32_t value; unescape_unicode(backslash + 2, 4, &value); @@ -538,6 +546,10 @@ size_t yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) { assert(unescape_type != YP_UNESCAPE_NONE); + if 
(backslash + 1 >= parser->end) { + return 0; + } + switch (backslash[1]) { case '\\': case '\'': diff --git a/yarp/yarp.c b/yarp/yarp.c index 376dfe32c3..176e9f76b6 100644 --- a/yarp/yarp.c +++ b/yarp/yarp.c @@ -7002,17 +7002,16 @@ parser_lex(yp_parser_t *parser) { // literally. In this case we'll skip past the next character // and find the next breakpoint. if (*breakpoint == '\\') { - // Check that we're not at the end of the file. - if (breakpoint + 1 >= parser->end) { + yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL; + size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false); + if (difference == 0) { + // we're at the end of the file breakpoint = NULL; continue; } - yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL; - size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false); - // If the result is an escaped newline ... - if (*(breakpoint + difference - 1) == '\n') { + if (breakpoint[difference - 1] == '\n') { if (parser->heredoc_end) { // ... if we are on the same line as a heredoc, flush the heredoc and // continue parsing after heredoc_end. @@ -7141,16 +7140,15 @@ parser_lex(yp_parser_t *parser) { // literally. In this case we'll skip past the next character // and find the next breakpoint. if (*breakpoint == '\\') { - // Check that we're not at the end of the file. - if (breakpoint + 1 >= parser->end) { + size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false); + if (difference == 0) { + // we're at the end of the file breakpoint = NULL; continue; } - size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false); - // If the result is an escaped newline ... - if (*(breakpoint + difference - 1) == '\n') { + if (breakpoint[difference - 1] == '\n') { if (parser->heredoc_end) { // ... 
if we are on the same line as a heredoc, flush the heredoc and // continue parsing after heredoc_end. @@ -7293,20 +7291,19 @@ parser_lex(yp_parser_t *parser) { breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1)); break; case '\\': { - // Check that we're not at the end of the file. - if (breakpoint + 1 >= parser->end) { - breakpoint = NULL; - break; - } - // If we hit escapes, then we need to treat the next token // literally. In this case we'll skip past the next character and // find the next breakpoint. yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL; size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false); + if (difference == 0) { + // we're at the end of the file + breakpoint = NULL; + break; + } // If the result is an escaped newline ... - if (*(breakpoint + difference - 1) == '\n') { + if (breakpoint[difference - 1] == '\n') { if (parser->heredoc_end) { // ... if we are on the same line as a heredoc, flush the heredoc and // continue parsing after heredoc_end. @@ -7463,12 +7460,6 @@ parser_lex(yp_parser_t *parser) { break; } case '\\': { - // Check that we're not at the end of the file. - if (breakpoint + 1 >= parser->end) { - breakpoint = NULL; - break; - } - // If we hit an escape, then we need to skip past // however many characters the escape takes up. However // it's important that if \n or \r\n are escaped that we @@ -7481,6 +7472,11 @@ parser_lex(yp_parser_t *parser) { } else { yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL; size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false); + if (difference == 0) { + // we're at the end of the file + breakpoint = NULL; + break; + } yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); |
