summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Mike Dalessio <mike.dalessio@gmail.com> 2023-08-31 18:40:29 -0400
committer git <svn-admin@ruby-lang.org> 2023-08-31 22:40:35 +0000
commit df4c77608e76068deed58b2781674b0eb247c325 (patch)
tree 4d159be612246f572dbce77d9bc1f49b4c2af6f7
parent 9930363aab6ac4b8d7034baff85cd86c17953dc9 (diff)
[ruby/yarp] fix: octal, hex, and unicode strings at the end of a file
(https://github.com/ruby/yarp/pull/1371)

* refactor: move EOF check into yp_unescape_calculate_difference
  parser_lex is a bit more readable when we can rely on that behavior
* fix: octal and hex digits at the end of a file
  Previously this resulted in invalid memory access.
* fix: unicode strings at the end of a file
  Previously this resulted in invalid memory access.
* Unterminated curly-bracket unicode is a syntax error

https://github.com/ruby/yarp/commit/21cf11acb5
-rw-r--r-- test/yarp/errors_test.rb 7
-rw-r--r-- test/yarp/fuzzer_test.rb 13
-rw-r--r-- yarp/unescape.c 40
-rw-r--r-- yarp/yarp.c 44
4 files changed, 66 insertions, 38 deletions
diff --git a/test/yarp/errors_test.rb b/test/yarp/errors_test.rb
index 0e1931ec6a..9e60f5d98b 100644
--- a/test/yarp/errors_test.rb
+++ b/test/yarp/errors_test.rb
@@ -621,6 +621,13 @@ module YARP
]
end
+ def test_unterminated_unicode_brackets_should_be_a_syntax_error
+ assert_errors expression('?\\u{3'), '?\\u{3', [
+ ["invalid Unicode escape.", 1..5],
+ ["invalid Unicode escape.", 1..5],
+ ]
+ end
+
def test_method_parameters_after_block
expected = DefNode(
Location(),
diff --git a/test/yarp/fuzzer_test.rb b/test/yarp/fuzzer_test.rb
index 568c2eaf08..f4abcd4ac8 100644
--- a/test/yarp/fuzzer_test.rb
+++ b/test/yarp/fuzzer_test.rb
@@ -22,6 +22,19 @@ module YARP
snippet "incomplete escaped list", "%w[\\"
snippet "incomplete escaped regex", "/a\\"
snippet "unterminated heredoc with unterminated escape at end of file", "<<A\n\\"
+ snippet "escaped octal at end of file 1", '"\\3'
+ snippet "escaped octal at end of file 2", '"\\33'
+ snippet "escaped hex at end of file 1", '"\\x'
+ snippet "escaped hex at end of file 2", '"\\x3'
+ snippet "escaped unicode at end of file 1", '"\\u{3'
+ snippet "escaped unicode at end of file 2", '"\\u{33'
+ snippet "escaped unicode at end of file 3", '"\\u{333'
+ snippet "escaped unicode at end of file 4", '"\\u{3333'
+ snippet "escaped unicode at end of file 5", '"\\u{33333'
+ snippet "escaped unicode at end of file 6", '"\\u{333333'
+ snippet "escaped unicode at end of file 7", '"\\u3'
+ snippet "escaped unicode at end of file 8", '"\\u33'
+ snippet "escaped unicode at end of file 9", '"\\u333'
snippet "statements node with multiple heredocs", <<~EOF
for <<A + <<B
diff --git a/yarp/unescape.c b/yarp/unescape.c
index b0aabf5e22..14c0faf2eb 100644
--- a/yarp/unescape.c
+++ b/yarp/unescape.c
@@ -69,17 +69,15 @@ char_is_ascii_printable(const uint8_t b) {
// Scan the 1-3 digits of octal into the value. Returns the number of digits
// scanned.
static inline size_t
-unescape_octal(const uint8_t *backslash, uint8_t *value) {
+unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
*value = (uint8_t) (backslash[1] - '0');
- if (!yp_char_is_octal_digit(backslash[2])) {
+ if (backslash + 2 >= end || !yp_char_is_octal_digit(backslash[2])) {
return 2;
}
-
*value = (uint8_t) ((*value << 3) | (backslash[2] - '0'));
- if (!yp_char_is_octal_digit(backslash[3])) {
+ if (backslash + 3 >= end || !yp_char_is_octal_digit(backslash[3])) {
return 3;
}
-
*value = (uint8_t) ((*value << 3) | (backslash[3] - '0'));
return 4;
}
@@ -93,12 +91,15 @@ unescape_hexadecimal_digit(const uint8_t value) {
// Scan the 1-2 digits of hexadecimal into the value. Returns the number of
// digits scanned.
static inline size_t
-unescape_hexadecimal(const uint8_t *backslash, uint8_t *value) {
+unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
+ *value = 0;
+ if (backslash + 2 >= end || !yp_char_is_hexadecimal_digit(backslash[2])) {
+ return 2;
+ }
*value = unescape_hexadecimal_digit(backslash[2]);
- if (!yp_char_is_hexadecimal_digit(backslash[3])) {
+ if (backslash + 3 >= end || !yp_char_is_hexadecimal_digit(backslash[3])) {
return 3;
}
-
*value = (uint8_t) ((*value << 4) | unescape_hexadecimal_digit(backslash[3]));
return 4;
}
@@ -204,7 +205,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7': case '8': case '9': {
uint8_t value;
- const uint8_t *cursor = backslash + unescape_octal(backslash, &value);
+ const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);
if (dest) {
dest[(*dest_length)++] = unescape_char(value, flags);
@@ -214,7 +215,7 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
// \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
case 'x': {
uint8_t value;
- const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value);
+ const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end);
if (dest) {
dest[(*dest_length)++] = unescape_char(value, flags);
@@ -236,13 +237,14 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
- while ((*unicode_cursor != '}') && (unicode_cursor < end)) {
+ while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
const uint8_t *unicode_start = unicode_cursor;
size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);
// \u{nnnn} character literal allows only 1-6 hexadecimal digits
- if (hexadecimal_length > 6)
+ if (hexadecimal_length > 6) {
yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape.");
+ }
// there are not hexadecimal characters
if (hexadecimal_length == 0) {
@@ -269,10 +271,16 @@ unescape(yp_parser_t *parser, uint8_t *dest, size_t *dest_length, const uint8_t
if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1)
yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal");
- return unicode_cursor + 1;
+
+ if (unicode_cursor < end && *unicode_cursor == '}') {
+ unicode_cursor++;
+ } else {
+ yp_diagnostic_list_append(&parser->error_list, backslash, unicode_cursor, "invalid Unicode escape.");
+ }
+ return unicode_cursor;
}
- if ((backslash + 2) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
+ if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
uint32_t value;
unescape_unicode(backslash + 2, 4, &value);
@@ -538,6 +546,10 @@ size_t
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
assert(unescape_type != YP_UNESCAPE_NONE);
+ if (backslash + 1 >= parser->end) {
+ return 0;
+ }
+
switch (backslash[1]) {
case '\\':
case '\'':
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 376dfe32c3..176e9f76b6 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -7002,17 +7002,16 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character
// and find the next breakpoint.
if (*breakpoint == '\\') {
- // Check that we're not at the end of the file.
- if (breakpoint + 1 >= parser->end) {
+ yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
+ size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+ if (difference == 0) {
+ // we're at the end of the file
breakpoint = NULL;
continue;
}
- yp_unescape_type_t unescape_type = lex_mode->as.list.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
- size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
-
// If the result is an escaped newline ...
- if (*(breakpoint + difference - 1) == '\n') {
+ if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@@ -7141,16 +7140,15 @@ parser_lex(yp_parser_t *parser) {
// literally. In this case we'll skip past the next character
// and find the next breakpoint.
if (*breakpoint == '\\') {
- // Check that we're not at the end of the file.
- if (breakpoint + 1 >= parser->end) {
+ size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
+ if (difference == 0) {
+ // we're at the end of the file
breakpoint = NULL;
continue;
}
- size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
-
// If the result is an escaped newline ...
- if (*(breakpoint + difference - 1) == '\n') {
+ if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@@ -7293,20 +7291,19 @@ parser_lex(yp_parser_t *parser) {
breakpoint = yp_strpbrk(parser, breakpoint + 1, breakpoints, parser->end - (breakpoint + 1));
break;
case '\\': {
- // Check that we're not at the end of the file.
- if (breakpoint + 1 >= parser->end) {
- breakpoint = NULL;
- break;
- }
-
// If we hit escapes, then we need to treat the next token
// literally. In this case we'll skip past the next character and
// find the next breakpoint.
yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+ if (difference == 0) {
+ // we're at the end of the file
+ breakpoint = NULL;
+ break;
+ }
// If the result is an escaped newline ...
- if (*(breakpoint + difference - 1) == '\n') {
+ if (breakpoint[difference - 1] == '\n') {
if (parser->heredoc_end) {
// ... if we are on the same line as a heredoc, flush the heredoc and
// continue parsing after heredoc_end.
@@ -7463,12 +7460,6 @@ parser_lex(yp_parser_t *parser) {
break;
}
case '\\': {
- // Check that we're not at the end of the file.
- if (breakpoint + 1 >= parser->end) {
- breakpoint = NULL;
- break;
- }
-
// If we hit an escape, then we need to skip past
// however many characters the escape takes up. However
// it's important that if \n or \r\n are escaped that we
@@ -7481,6 +7472,11 @@ parser_lex(yp_parser_t *parser) {
} else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
+ if (difference == 0) {
+ // we're at the end of the file
+ breakpoint = NULL;
+ break;
+ }
yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);