#include "yarp.h"

/******************************************************************************/
/* Character checks                                                           */
/******************************************************************************/

// Returns true when every one of the `length` characters starting at `c` is a
// hexadecimal digit. Scanning stops at the first character that is not one, so
// characters after a mismatch are never read. An empty range yields true.
static inline bool
yp_char_is_hexadecimal_digits(const char *c, size_t length) {
    const char *current = c;
    size_t remaining = length;

    while (remaining-- > 0) {
        if (!yp_char_is_hexadecimal_digit(*current++)) {
            return false;
        }
    }

    return true;
}

// Returns the width in bytes of the character found at `start`.
//
// Going through the encoding's char_width function pointer is expensive, so it
// is only consulted when it has to be: when the parser's encoding has been
// changed, or when the first byte is outside of the 7-bit ASCII range. In every
// other case the answer is 1.
static inline size_t
yp_char_width(yp_parser_t *parser, const char *start, const char *end) {
    // Fast path. The byte is only inspected when the encoding is unchanged.
    if (!parser->encoding_changed && (*((const unsigned char *) start) < 0x80)) {
        return 1;
    }

    return parser->encoding.char_width(start, end - start);
}

/******************************************************************************/
/* Lookup tables for characters                                               */
/******************************************************************************/

// Maps the character that follows a backslash to the byte that the escape
// stands for. Only escapes that produce exactly one character live here; every
// slot that is not listed is zero.
static const unsigned char unescape_chars[] = {
    ['\''] = '\'',
    ['\\'] = '\\',
    ['a'] = '\a',
    ['b'] = '\b',
    ['e'] = '\033',
    ['f'] = '\f',
    ['n'] = '\n',
    ['r'] = '\r',
    ['s'] = ' ',
    ['t'] = '\t',
    ['v'] = '\v'
};

// This is a lookup table for whether or not an ASCII character is printable.
static const bool ascii_printable_chars[] = {
    // 0x00 - 0x0F: only \t, \n, \v, \f and \r (0x09 - 0x0D) are set
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    // 0x10 - 0x1F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20 - 0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x30 - 0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x40 - 0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x50 - 0x5F: the backslash (0x5C) is cleared
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
    // 0x60 - 0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0x70 - 0x7F: DEL (0x7F) is cleared
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
};

// Returns true when the given character is below 0x80 and is flagged in the
// ascii_printable_chars table.
static inline bool
char_is_ascii_printable(const char c) {
    const unsigned char byte = (unsigned char) c;

    if (byte >= 0x80) {
        return false;
    }

    return ascii_printable_chars[byte];
}

/******************************************************************************/
/* Unescaping for segments                                                    */
/******************************************************************************/

// Scan an octal escape into the value. The character right after the backslash
// is always taken as the first digit, and up to two more octal digits are
// consumed if they follow. The return value is the offset, measured from the
// backslash, of the first character that is not part of the escape (2, 3 or
// 4). The accumulated number is truncated to a single byte.
static inline size_t
unescape_octal(const char *backslash, unsigned char *value) {
    unsigned char accumulated = (unsigned char) (backslash[1] - '0');
    size_t consumed = 2;

    while (consumed < 4 && yp_char_is_octal_digit(backslash[consumed])) {
        accumulated = (unsigned char) ((accumulated << 3) | (backslash[consumed] - '0'));
        consumed++;
    }

    *value = accumulated;
    return consumed;
}

// Convert a hexadecimal digit into its equivalent value. Characters up to '9'
// are treated as decimal digits; anything else is treated as a letter, whose
// low three bits give its position (a/A is 1, b/B is 2, ...).
static inline unsigned char
unescape_hexadecimal_digit(const char value) {
    if (value <= '9') {
        return (unsigned char) (value - '0');
    }

    return (unsigned char) ((value & 0x7) + 9);
}

// Scan a \x escape into the value. The character after the 'x' is always taken
// as the first digit; a second digit is consumed only if it is hexadecimal. The
// return value is the offset, measured from the backslash, of the first
// character that is not part of the escape (3 or 4).
static inline size_t
unescape_hexadecimal(const char *backslash, unsigned char *value) {
    const unsigned char high = unescape_hexadecimal_digit(backslash[2]);

    if (yp_char_is_hexadecimal_digit(backslash[3])) {
        *value = (unsigned char) ((high << 4) | unescape_hexadecimal_digit(backslash[3]));
        return 4;
    }

    *value = high;
    return 3;
}

// Scan the 4 digits of a Unicode escape into the value. Returns the number of
// digits scanned.
This function assumes that the characters have already been // validated. static inline void unescape_unicode(const char *string, size_t length, uint32_t *value) { *value = 0; for (size_t index = 0; index < length; index++) { if (index != 0) *value <<= 4; *value |= unescape_hexadecimal_digit(string[index]); } } // Accepts the pointer to the string to write the unicode value along with the // 32-bit value to write. Writes the UTF-8 representation of the value to the // string and returns the number of bytes written. static inline size_t unescape_unicode_write(char *dest, uint32_t value, const char *start, const char *end, yp_list_t *error_list) { unsigned char *bytes = (unsigned char *) dest; if (value <= 0x7F) { // 0xxxxxxx bytes[0] = (unsigned char) value; return 1; } if (value <= 0x7FF) { // 110xxxxx 10xxxxxx bytes[0] = (unsigned char) (0xC0 | (value >> 6)); bytes[1] = (unsigned char) (0x80 | (value & 0x3F)); return 2; } if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx bytes[0] = (unsigned char) (0xE0 | (value >> 12)); bytes[1] = (unsigned char) (0x80 | ((value >> 6) & 0x3F)); bytes[2] = (unsigned char) (0x80 | (value & 0x3F)); return 3; } // At this point it must be a 4 digit UTF-8 representation. If it's not, then // the input is invalid. if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx bytes[0] = (unsigned char) (0xF0 | (value >> 18)); bytes[1] = (unsigned char) (0x80 | ((value >> 12) & 0x3F)); bytes[2] = (unsigned char) (0x80 | ((value >> 6) & 0x3F)); bytes[3] = (unsigned char) (0x80 | (value & 0x3F)); return 4; } // If we get here, then the value is too big. This is an error, but we don't // want to just crash, so instead we'll add an error to the error list and put // in a replacement character instead. 
yp_diagnostic_list_append(error_list, start, end, "Invalid Unicode escape sequence."); bytes[0] = 0xEF; bytes[1] = 0xBF; bytes[2] = 0xBD; return 3; } typedef enum { YP_UNESCAPE_FLAG_NONE = 0, YP_UNESCAPE_FLAG_CONTROL = 1, YP_UNESCAPE_FLAG_META = 2, YP_UNESCAPE_FLAG_EXPECT_SINGLE = 4 } yp_unescape_flag_t; // Unescape a single character value based on the given flags. static inline unsigned char unescape_char(const unsigned char value, const unsigned char flags) { unsigned char unescaped = value; if (flags & YP_UNESCAPE_FLAG_CONTROL) { unescaped &= 0x1f; } if (flags & YP_UNESCAPE_FLAG_META) { unescaped |= 0x80; } return unescaped; } // Read a specific escape sequence into the given destination. static const char * unescape(yp_parser_t *parser, char *dest, size_t *dest_length, const char *backslash, const char *end, const unsigned char flags, bool write_to_str) { switch (backslash[1]) { case 'a': case 'b': case 'e': case 'f': case 'n': case 'r': case 's': case 't': case 'v': if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char(unescape_chars[(unsigned char) backslash[1]], flags); } return backslash + 2; // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7]) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { unsigned char value; const char *cursor = backslash + unescape_octal(backslash, &value); if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char(value, flags); } return cursor; } // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F]) case 'x': { unsigned char value; const char *cursor = backslash + unescape_hexadecimal(backslash, &value); if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char(value, flags); } return cursor; } // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F]) // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F]) case 'u': { if ((flags & 
YP_UNESCAPE_FLAG_CONTROL) | (flags & YP_UNESCAPE_FLAG_META)) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Unicode escape sequence cannot be used with control or meta flags."); return backslash + 2; } if ((backslash + 3) < end && backslash[2] == '{') { const char *unicode_cursor = backslash + 3; const char *extra_codepoints_start = NULL; int codepoints_count = 0; unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor); while ((*unicode_cursor != '}') && (unicode_cursor < end)) { const char *unicode_start = unicode_cursor; size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor); // \u{nnnn} character literal allows only 1-6 hexadecimal digits if (hexadecimal_length > 6) yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "invalid Unicode escape."); // there are not hexadecimal characters if (hexadecimal_length == 0) { yp_diagnostic_list_append(&parser->error_list, unicode_cursor, unicode_cursor + hexadecimal_length, "unterminated Unicode escape"); return unicode_cursor; } unicode_cursor += hexadecimal_length; codepoints_count++; if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count == 2) extra_codepoints_start = unicode_start; uint32_t value; unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value); if (write_to_str) { *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, &parser->error_list); } unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor); } // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm} if (flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE && codepoints_count > 1) yp_diagnostic_list_append(&parser->error_list, extra_codepoints_start, unicode_cursor - 1, "Multiple codepoints at single character literal"); return unicode_cursor + 1; } if ((backslash + 2) < end && 
yp_char_is_hexadecimal_digits(backslash + 2, 4)) { uint32_t value; unescape_unicode(backslash + 2, 4, &value); if (write_to_str) { *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, &parser->error_list); } return backslash + 6; } yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid Unicode escape sequence"); return backslash + 2; } // \c\M-x meta control character, where x is an ASCII printable character // \c? delete, ASCII 7Fh (DEL) // \cx control character, where x is an ASCII printable character case 'c': if (backslash + 2 >= end) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_CONTROL) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); return backslash + 2; } switch (backslash[2]) { case '\\': return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str); case '?': if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char(0x7f, flags); } return backslash + 3; default: { if (!char_is_ascii_printable(backslash[2])) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); return backslash + 2; } if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL); } return backslash + 3; } } // \C-x control character, where x is an ASCII printable character // \C-? 
delete, ASCII 7Fh (DEL) case 'C': if (backslash + 3 >= end) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_CONTROL) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Control escape sequence cannot be doubled."); return backslash + 2; } if (backslash[2] != '-') { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); return backslash + 2; } switch (backslash[3]) { case '\\': return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, write_to_str); case '?': if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char(0x7f, flags); } return backslash + 4; default: if (!char_is_ascii_printable(backslash[3])) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid control escape sequence"); return backslash + 2; } if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL); } return backslash + 4; } // \M-\C-x meta control character, where x is an ASCII printable character // \M-\cx meta control character, where x is an ASCII printable character // \M-x meta character, where x is an ASCII printable character case 'M': { if (backslash + 3 >= end) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 1, "Invalid control escape sequence"); return end; } if (flags & YP_UNESCAPE_FLAG_META) { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Meta escape sequence cannot be doubled."); return backslash + 2; } if (backslash[2] != '-') { yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence"); return backslash + 2; } if (backslash[3] == '\\') { return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, write_to_str); } if 
(char_is_ascii_printable(backslash[3])) { if (write_to_str) { dest[(*dest_length)++] = (char) unescape_char((const unsigned char) backslash[3], flags | YP_UNESCAPE_FLAG_META); } return backslash + 4; } yp_diagnostic_list_append(&parser->error_list, backslash, backslash + 2, "Invalid meta escape sequence"); return backslash + 3; } // \n case '\n': return backslash + 2; // \r case '\r': if (backslash + 2 < end && backslash[2] == '\n') { return backslash + 3; } /* fallthrough */ // In this case we're escaping something that doesn't need escaping. default: { size_t width = yp_char_width(parser, backslash + 1, end); if (write_to_str) { memcpy(dest + *dest_length, backslash + 1, width); *dest_length += width; } return backslash + 1 + width; } } } /******************************************************************************/ /* Public functions and entrypoints */ /******************************************************************************/ // Unescape the contents of the given token into the given string using the // given unescape mode. The supported escapes are: // // \a bell, ASCII 07h (BEL) // \b backspace, ASCII 08h (BS) // \t horizontal tab, ASCII 09h (TAB) // \n newline (line feed), ASCII 0Ah (LF) // \v vertical tab, ASCII 0Bh (VT) // \f form feed, ASCII 0Ch (FF) // \r carriage return, ASCII 0Dh (CR) // \e escape, ASCII 1Bh (ESC) // \s space, ASCII 20h (SPC) // \\ backslash // \nnn octal bit pattern, where nnn is 1-3 octal digits ([0-7]) // \xnn hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F]) // \unnnn Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F]) // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F]) // \cx or \C-x control character, where x is an ASCII printable character // \M-x meta character, where x is an ASCII printable character // \M-\C-x meta control character, where x is an ASCII printable character // \M-\cx same as above // \c\M-x same as above // \c? 
or \C-? delete, ASCII 7Fh (DEL) // YP_EXPORTED_FUNCTION void yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) { if (unescape_type == YP_UNESCAPE_NONE) { // If we're not unescaping then we can reference the source directly. return; } const char *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding); if (backslash == NULL) { // Here there are no escapes, so we can reference the source directly. return; } // Here we have found an escape character, so we need to handle all escapes // within the string. char *allocated = malloc(string->length); if (allocated == NULL) { yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, "Failed to allocate memory for unescaping."); return; } // This is the memory address where we're putting the unescaped string. char *dest = allocated; size_t dest_length = 0; // This is the current position in the source string that we're looking at. // It's going to move along behind the backslash so that we can copy each // segment of the string that doesn't contain an escape. const char *cursor = string->source; const char *end = string->source + string->length; // For each escape found in the source string, we will handle it and update // the moving cursor->backslash window. while (backslash != NULL && backslash + 1 < end) { assert(dest_length < string->length); // This is the size of the segment of the string from the previous escape // or the start of the string to the current escape. size_t segment_size = (size_t) (backslash - cursor); // Here we're going to copy everything up until the escape into the // destination buffer. 
memcpy(dest + dest_length, cursor, segment_size); dest_length += segment_size; switch (backslash[1]) { case '\\': case '\'': dest[dest_length++] = (char) unescape_chars[(unsigned char) backslash[1]]; cursor = backslash + 2; break; default: if (unescape_type == YP_UNESCAPE_MINIMAL) { // In this case we're escaping something that doesn't need escaping. dest[dest_length++] = '\\'; cursor = backslash + 1; break; } // This is the only type of unescaping left. In this case we need to // handle all of the different unescapes. assert(unescape_type == YP_UNESCAPE_ALL); cursor = unescape(parser, dest, &dest_length, backslash, end, YP_UNESCAPE_FLAG_NONE, true); break; } if (end > cursor) { backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding); } else { backslash = NULL; } } // We need to copy the final segment of the string after the last escape. if (end > cursor) { memcpy(dest + dest_length, cursor, (size_t) (end - cursor)); } else { cursor = end; } // If the string was already allocated, then we need to free that memory // here. That's because we're about to override it with the escaped string. yp_string_free(string); // We also need to update the length at the end. This is because every escape // reduces the length of the final string, and we don't want garbage at the // end. yp_string_owned_init(string, allocated, dest_length + ((size_t) (end - cursor))); } // This function is similar to yp_unescape_manipulate_string, except it doesn't // actually perform any string manipulations. 
Instead, it calculates how long // the unescaped character is, and returns that value size_t yp_unescape_calculate_difference(yp_parser_t *parser, const char *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) { assert(unescape_type != YP_UNESCAPE_NONE); switch (backslash[1]) { case '\\': case '\'': return 2; default: { if (unescape_type == YP_UNESCAPE_MINIMAL) { return 1 + yp_char_width(parser, backslash + 1, parser->end); } // This is the only type of unescaping left. In this case we need to // handle all of the different unescapes. assert(unescape_type == YP_UNESCAPE_ALL); unsigned char flags = YP_UNESCAPE_FLAG_NONE; if (expect_single_codepoint) flags |= YP_UNESCAPE_FLAG_EXPECT_SINGLE; const char *cursor = unescape(parser, NULL, 0, backslash, parser->end, flags, false); assert(cursor > backslash); return (size_t) (cursor - backslash); } } } // This is one of the main entry points into the extension. It accepts a source // string, a type of unescaping, and a pointer to a result string. It returns a // boolean indicating whether or not the unescaping was successful. YP_EXPORTED_FUNCTION bool yp_unescape_string(const char *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) { yp_parser_t parser; yp_parser_init(&parser, start, length, NULL); yp_string_shared_init(result, start, start + length); yp_unescape_manipulate_string(&parser, result, unescape_type); bool success = yp_list_empty_p(&parser.error_list); yp_parser_free(&parser); return success; }