#include "yarp.h"

/******************************************************************************/
/* Character checks                                                           */
/******************************************************************************/

// Returns true when each of the first `length` bytes of `string` satisfies
// yp_char_is_hexadecimal_digit. A length of zero yields true.
static inline bool
yp_char_is_hexadecimal_digits(const uint8_t *string, size_t length) {
    size_t remaining = length;

    for (const uint8_t *cursor = string; remaining > 0; cursor++, remaining--) {
        if (!yp_char_is_hexadecimal_digit(*cursor)) return false;
    }

    return true;
}

// Returns the width in bytes of the character that begins at `start`. Going
// through the encoding's char_width function pointer is comparatively
// expensive, so it is only done when it cannot be avoided: when the parser's
// encoding_changed flag is set, or when the leading byte is 0x80 or above. In
// every other case the answer is 1.
static inline size_t
yp_char_width(yp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
    // Fast path: encoding unchanged and the leading byte is below 0x80.
    if (!parser->encoding_changed && (*start < 0x80)) {
        return 1;
    }

    return parser->encoding.char_width(start, end - start);
}

/******************************************************************************/
/* Lookup tables for characters                                               */
/******************************************************************************/

// Lookup table for escapes that produce a single byte. It is indexed by the
// character that follows the backslash and holds the byte that the escape
// stands for. Indices that are not listed are zero-initialized. Entries are
// listed in order of the byte they produce.
static const uint8_t unescape_chars[] = {
    ['a']  = '\a',   // 0x07 bell
    ['b']  = '\b',   // 0x08 backspace
    ['t']  = '\t',   // 0x09 horizontal tab
    ['n']  = '\n',   // 0x0A line feed
    ['v']  = '\v',   // 0x0B vertical tab
    ['f']  = '\f',   // 0x0C form feed
    ['r']  = '\r',   // 0x0D carriage return
    ['e']  = '\033', // 0x1B escape
    ['s']  = ' ',    // 0x20 space
    ['\''] = '\'',   // 0x27 single quote
    ['\\'] = '\\'    // 0x5C backslash
};

// This is a lookup table for whether or not an ASCII character is printable.
// It has one entry per byte value from 0x00 through 0x7F, sixteen per row. Note
// that the whitespace controls 0x09-0x0D are marked as printable here, while
// the backslash (0x5C) and DEL (0x7F) are not.
static const bool ascii_printable_chars[] = {
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, // 0x00-0x0F: only \t \n \v \f \r
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20-0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30-0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40-0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, // 0x50-0x5F: backslash (0x5C) excluded
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60-0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0  // 0x70-0x7F: DEL (0x7F) excluded
};

// Returns true if the byte is in the ASCII range (below 0x80) and is marked in
// the ascii_printable_chars lookup table.
static inline bool
char_is_ascii_printable(const uint8_t b) {
    // Bytes at or above 0x80 are outside of the table.
    if (b >= 0x80) {
        return false;
    }

    return ascii_printable_chars[b];
}

/******************************************************************************/
/* Unescaping for segments                                                    */
/******************************************************************************/

// Decode the octal escape that starts at `backslash`, where backslash[1] is
// taken as the first digit. Up to two further octal digits are consumed when
// they are present before `end`. The decoded value, truncated to 8 bits, is
// stored in *value. Returns the number of bytes consumed counting the backslash
// itself, which is 2, 3, or 4.
static inline size_t
unescape_octal(const uint8_t *backslash, uint8_t *value, const uint8_t *end) {
    uint8_t decoded = (uint8_t) (backslash[1] - '0');
    size_t consumed = 2;

    // Fold in the optional second and third digits.
    while (consumed < 4 && backslash + consumed < end && yp_char_is_octal_digit(backslash[consumed])) {
        decoded = (uint8_t) ((decoded << 3) | (backslash[consumed] - '0'));
        consumed++;
    }

    *value = decoded;
    return consumed;
}

// Convert a hexadecimal digit character into the number it represents. The
// input is expected to be a valid digit ([0-9a-fA-F]); nothing is validated
// here.
static inline uint8_t
unescape_hexadecimal_digit(const uint8_t value) {
    if (value <= '9') {
        // Decimal digits are offset from '0'.
        return (uint8_t) (value - '0');
    }

    // The low three bits of 'a'-'f' and 'A'-'F' are 1 through 6, so adding 9
    // gives 10 through 15 for either case.
    return (uint8_t) ((value & 0x7) + 9);
}

// Decode the \x escape that starts at `backslash`, reading one or two
// hexadecimal digits from backslash[2] onward as long as they are before `end`.
// The decoded byte is stored in *value (0 when no digit is found). When there
// is no digit at all an error is appended to error_list, if one was given.
// Returns the number of bytes consumed counting the backslash and the 'x',
// which is 2, 3, or 4.
static inline size_t
unescape_hexadecimal(const uint8_t *backslash, uint8_t *value, const uint8_t *end, yp_list_t *error_list) {
    const uint8_t *first = backslash + 2;
    *value = 0;

    // At least one digit is required.
    if (!(first < end && yp_char_is_hexadecimal_digit(*first))) {
        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_HEXADECIMAL);
        return 2;
    }

    const uint8_t high = unescape_hexadecimal_digit(*first);
    const uint8_t *second = backslash + 3;

    // The second digit is optional.
    if (second < end && yp_char_is_hexadecimal_digit(*second)) {
        *value = (uint8_t) ((high << 4) | unescape_hexadecimal_digit(*second));
        return 4;
    }

    *value = high;
    return 3;
}

// Accumulate the `length` hexadecimal digits starting at `string` into *value,
// most significant digit first. Nothing is returned. This function assumes
// that the characters have already been validated as hexadecimal digits.
static inline void
unescape_unicode(const uint8_t *string, size_t length, uint32_t *value) {
    uint32_t codepoint = 0;

    for (size_t offset = 0; offset < length; offset++) {
        codepoint = (codepoint << 4) | unescape_hexadecimal_digit(string[offset]);
    }

    *value = codepoint;
}

// Write the UTF-8 encoding of the code point `value` to `dest` and return the
// number of bytes written (1 to 4). `dest` must have room for the bytes that
// are written.
//
// `start` and `end` delimit the source text of the escape and are only used as
// the location of a diagnostic. `error_list` may be NULL to suppress
// diagnostics.
//
// Values that cannot be encoded as UTF-8 (anything above 0x10FFFF and the
// surrogate range 0xD800-0xDFFF) are reported with
// YP_ERR_ESCAPE_INVALID_UNICODE and replaced by U+FFFD, which takes 3 bytes.
static inline size_t
unescape_unicode_write(uint8_t *dest, uint32_t value, const uint8_t *start, const uint8_t *end, yp_list_t *error_list) {
    if (value <= 0x7F) {
        // 0xxxxxxx
        dest[0] = (uint8_t) value;
        return 1;
    }

    if (value <= 0x7FF) {
        // 110xxxxx 10xxxxxx
        dest[0] = (uint8_t) (0xC0 | (value >> 6));
        dest[1] = (uint8_t) (0x80 | (value & 0x3F));
        return 2;
    }

    // Surrogates are reserved for UTF-16 and have no valid UTF-8 encoding, so
    // they must not be written out as an ordinary 3 byte sequence.
    const bool surrogate = (value >= 0xD800) && (value <= 0xDFFF);

    if (!surrogate && value <= 0x10FFFF) {
        if (value <= 0xFFFF) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            dest[0] = (uint8_t) (0xE0 | (value >> 12));
            dest[1] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
            dest[2] = (uint8_t) (0x80 | (value & 0x3F));
            return 3;
        }

        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        dest[0] = (uint8_t) (0xF0 | (value >> 18));
        dest[1] = (uint8_t) (0x80 | ((value >> 12) & 0x3F));
        dest[2] = (uint8_t) (0x80 | ((value >> 6) & 0x3F));
        dest[3] = (uint8_t) (0x80 | (value & 0x3F));
        return 4;
    }

    // If we get here, then the value is either too big or a surrogate. This is
    // an error, but we don't want to just crash, so instead we'll add an error
    // to the error list and put in a replacement character (U+FFFD) instead.
    if (error_list) yp_diagnostic_list_append(error_list, start, end, YP_ERR_ESCAPE_INVALID_UNICODE);
    dest[0] = 0xEF;
    dest[1] = 0xBF;
    dest[2] = 0xBD;
    return 3;
}

// Bit flags that modify how a single escape sequence is unescaped. Each flag
// occupies its own bit so that they can be combined with bitwise or.
typedef enum {
    YP_UNESCAPE_FLAG_NONE = 0,
    YP_UNESCAPE_FLAG_CONTROL = 1 << 0,
    YP_UNESCAPE_FLAG_META = 1 << 1,
    YP_UNESCAPE_FLAG_EXPECT_SINGLE = 1 << 2
} yp_unescape_flag_t;

// Apply the control and meta modifiers selected by `flags` to a single byte.
// The control flag keeps only the low five bits of the value and the meta flag
// sets the high bit. Control is applied before meta.
static inline uint8_t
unescape_char(uint8_t value, const uint8_t flags) {
    const uint8_t control_mask = (flags & YP_UNESCAPE_FLAG_CONTROL) ? 0x1f : 0xff;
    const uint8_t meta_bit = (flags & YP_UNESCAPE_FLAG_META) ? 0x80 : 0x00;

    return (uint8_t) ((value & control_mask) | meta_bit);
}

// Read a specific escape sequence and, optionally, write the bytes that it
// stands for into the given destination.
//
//   parser      - used to compute the width of a character whose escape is
//                 unnecessary
//   dest        - the buffer to write into, or NULL to only measure the escape;
//                 when NULL, dest_length is never dereferenced
//   dest_length - in/out number of bytes already written to dest
//   backslash   - points at the backslash; backslash[1] must be readable, which
//                 means that backslash + 1 < end
//   end         - one past the last readable byte
//   flags       - yp_unescape_flag_t bits: the control/meta modifiers collected
//                 from enclosing \c, \C- and \M- prefixes, as well as
//                 YP_UNESCAPE_FLAG_EXPECT_SINGLE
//   error_list  - receives diagnostics, or NULL to suppress them
//
// Returns a pointer to the first byte after the consumed escape sequence. The
// returned pointer is always greater than `backslash`.
static const uint8_t *
unescape(
    yp_parser_t *parser,
    uint8_t *dest,
    size_t *dest_length,
    const uint8_t *backslash,
    const uint8_t *end,
    const uint8_t flags,
    yp_list_t *error_list
) {
    switch (backslash[1]) {
        case 'a':
        case 'b':
        case 'e':
        case 'f':
        case 'n':
        case 'r':
        case 's':
        case 't':
        case 'v':
            if (dest) {
                dest[(*dest_length)++] = unescape_char(unescape_chars[backslash[1]], flags);
            }
            return backslash + 2;
        // \nnn         octal bit pattern, where nnn is 1-3 octal digits ([0-7])
        //
        // Note that 8 and 9 are not octal digits, so \8 and \9 are not handled
        // here. They are escapes of a character that doesn't need escaping and
        // are handled by the default case below.
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7': {
            uint8_t value;
            const uint8_t *cursor = backslash + unescape_octal(backslash, &value, end);

            if (dest) {
                dest[(*dest_length)++] = unescape_char(value, flags);
            }
            return cursor;
        }
        // \xnn         hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
        case 'x': {
            uint8_t value;
            const uint8_t *cursor = backslash + unescape_hexadecimal(backslash, &value, end, error_list);

            if (dest) {
                dest[(*dest_length)++] = unescape_char(value, flags);
            }
            return cursor;
        }
        // \u{nnnn ...} Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
        // \unnnn       Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
        case 'u': {
            // Unicode escapes cannot be combined with control or meta prefixes.
            if (flags & (YP_UNESCAPE_FLAG_CONTROL | YP_UNESCAPE_FLAG_META)) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS);
                return backslash + 2;
            }

            if ((backslash + 3) < end && backslash[2] == '{') {
                const uint8_t *unicode_cursor = backslash + 3;
                const uint8_t *extra_codepoints_start = NULL;
                int codepoints_count = 0;

                unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);

                // Each iteration consumes one codepoint and the whitespace
                // that follows it.
                while ((unicode_cursor < end) && (*unicode_cursor != '}')) {
                    const uint8_t *unicode_start = unicode_cursor;
                    size_t hexadecimal_length = yp_strspn_hexadecimal_digit(unicode_cursor, end - unicode_cursor);

                    // \u{nnnn} character literal allows only 1-6 hexadecimal digits
                    if (hexadecimal_length > 6) {
                        if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, YP_ERR_ESCAPE_INVALID_UNICODE_LONG);
                    }
                    // there are not hexadecimal characters
                    else if (hexadecimal_length == 0) {
                        if (error_list) yp_diagnostic_list_append(error_list, unicode_cursor, unicode_cursor + hexadecimal_length, YP_ERR_ESCAPE_INVALID_UNICODE);
                        return unicode_cursor;
                    }

                    unicode_cursor += hexadecimal_length;

                    // Remember where the second codepoint starts so that it can
                    // be reported when only a single one was expected.
                    codepoints_count++;
                    if ((flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE) && codepoints_count == 2) {
                        extra_codepoints_start = unicode_start;
                    }

                    uint32_t value;
                    unescape_unicode(unicode_start, (size_t) (unicode_cursor - unicode_start), &value);
                    if (dest) {
                        *dest_length += unescape_unicode_write(dest + *dest_length, value, unicode_start, unicode_cursor, error_list);
                    }

                    unicode_cursor += yp_strspn_whitespace(unicode_cursor, end - unicode_cursor);
                }

                // ?\u{nnnn} character literal should contain only one codepoint and cannot be like ?\u{nnnn mmmm}
                if ((flags & YP_UNESCAPE_FLAG_EXPECT_SINGLE) && codepoints_count > 1) {
                    if (error_list) yp_diagnostic_list_append(error_list, extra_codepoints_start, unicode_cursor - 1, YP_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
                }

                if (unicode_cursor < end && *unicode_cursor == '}') {
                    unicode_cursor++;
                } else {
                    if (error_list) yp_diagnostic_list_append(error_list, backslash, unicode_cursor, YP_ERR_ESCAPE_INVALID_UNICODE_TERM);
                }

                return unicode_cursor;
            }
            else if ((backslash + 5) < end && yp_char_is_hexadecimal_digits(backslash + 2, 4)) {
                uint32_t value;
                unescape_unicode(backslash + 2, 4, &value);

                if (dest) {
                    *dest_length += unescape_unicode_write(dest + *dest_length, value, backslash + 2, backslash + 6, error_list);
                }
                return backslash + 6;
            }

            if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_UNICODE);
            return backslash + 2;
        }
        // \c\M-x       meta control character, where x is an ASCII printable character
        // \c?          delete, ASCII 7Fh (DEL)
        // \cx          control character, where x is an ASCII printable character
        case 'c':
            if (backslash + 2 >= end) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
                return backslash + 2;
            }

            switch (backslash[2]) {
                case '\\':
                    // The nested escape reads the byte after its backslash, so
                    // that byte has to be in bounds before we recurse.
                    if (backslash + 3 >= end) {
                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                        return end;
                    }
                    return unescape(parser, dest, dest_length, backslash + 2, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
                case '?':
                    if (dest) {
                        dest[(*dest_length)++] = unescape_char(0x7f, flags);
                    }
                    return backslash + 3;
                default: {
                    if (!char_is_ascii_printable(backslash[2])) {
                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                        return backslash + 2;
                    }

                    if (dest) {
                        dest[(*dest_length)++] = unescape_char(backslash[2], flags | YP_UNESCAPE_FLAG_CONTROL);
                    }
                    return backslash + 3;
                }
            }
        // \C-x         control character, where x is an ASCII printable character
        // \C-?         delete, ASCII 7Fh (DEL)
        case 'C':
            if (backslash + 3 >= end) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_CONTROL) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL_REPEAT);
                return backslash + 2;
            }

            if (backslash[2] != '-') {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                return backslash + 2;
            }

            switch (backslash[3]) {
                case '\\':
                    // The nested escape reads the byte after its backslash, so
                    // that byte has to be in bounds before we recurse.
                    if (backslash + 4 >= end) {
                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_CONTROL);
                        return end;
                    }
                    return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_CONTROL, error_list);
                case '?':
                    if (dest) {
                        dest[(*dest_length)++] = unescape_char(0x7f, flags);
                    }
                    return backslash + 4;
                default:
                    if (!char_is_ascii_printable(backslash[3])) {
                        if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_CONTROL);
                        return backslash + 2;
                    }

                    if (dest) {
                        dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_CONTROL);
                    }
                    return backslash + 4;
            }
        // \M-\C-x      meta control character, where x is an ASCII printable character
        // \M-\cx       meta control character, where x is an ASCII printable character
        // \M-x         meta character, where x is an ASCII printable character
        case 'M': {
            if (backslash + 3 >= end) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_META);
                return end;
            }

            if (flags & YP_UNESCAPE_FLAG_META) {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META_REPEAT);
                return backslash + 2;
            }

            if (backslash[2] != '-') {
                if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META);
                return backslash + 2;
            }

            if (backslash[3] == '\\') {
                // The nested escape reads the byte after its backslash, so that
                // byte has to be in bounds before we recurse.
                if (backslash + 4 >= end) {
                    if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 1, YP_ERR_ESCAPE_INVALID_META);
                    return end;
                }
                return unescape(parser, dest, dest_length, backslash + 3, end, flags | YP_UNESCAPE_FLAG_META, error_list);
            }

            if (char_is_ascii_printable(backslash[3])) {
                if (dest) {
                    dest[(*dest_length)++] = unescape_char(backslash[3], flags | YP_UNESCAPE_FLAG_META);
                }
                return backslash + 4;
            }

            if (error_list) yp_diagnostic_list_append(error_list, backslash, backslash + 2, YP_ERR_ESCAPE_INVALID_META);
            return backslash + 3;
        }
        // \n
        // A backslash followed by a newline produces no output.
        case '\n':
            return backslash + 2;
        // \r
        // A backslash followed by \r\n produces no output either. A lone \r is
        // treated like any other character that doesn't need escaping.
        case '\r':
            if (backslash + 2 < end && backslash[2] == '\n') {
                return backslash + 3;
            }
        /* fallthrough */
        // In this case we're escaping something that doesn't need escaping.
        default: {
            size_t width = yp_char_width(parser, backslash + 1, end);

            if (dest) {
                memcpy(dest + *dest_length, backslash + 1, width);
                *dest_length += width;
            }

            return backslash + 1 + width;
        }
    }
}

/******************************************************************************/
/* Public functions and entrypoints                                           */
/******************************************************************************/

// Replace the contents of the given string with its unescaped form, according
// to the given unescape type. When nothing has to change (the type is
// YP_UNESCAPE_NONE or the string holds no backslash) the string is left
// untouched. Otherwise a new buffer is allocated, filled, and handed over to
// the string as owned memory. If that allocation fails, an error is appended to
// the parser's error list and the string is left untouched.
//
// With YP_UNESCAPE_MINIMAL only \\ and \' are unescaped; every other backslash
// is copied through as is. With YP_UNESCAPE_ALL the supported escapes are:
//
// \a             bell, ASCII 07h (BEL)
// \b             backspace, ASCII 08h (BS)
// \t             horizontal tab, ASCII 09h (TAB)
// \n             newline (line feed), ASCII 0Ah (LF)
// \v             vertical tab, ASCII 0Bh (VT)
// \f             form feed, ASCII 0Ch (FF)
// \r             carriage return, ASCII 0Dh (CR)
// \e             escape, ASCII 1Bh (ESC)
// \s             space, ASCII 20h (SPC)
// \\             backslash
// \nnn           octal bit pattern, where nnn is 1-3 octal digits ([0-7])
// \xnn           hexadecimal bit pattern, where nn is 1-2 hexadecimal digits ([0-9a-fA-F])
// \unnnn         Unicode character, where nnnn is exactly 4 hexadecimal digits ([0-9a-fA-F])
// \u{nnnn ...}   Unicode character(s), where each nnnn is 1-6 hexadecimal digits ([0-9a-fA-F])
// \cx or \C-x    control character, where x is an ASCII printable character
// \M-x           meta character, where x is an ASCII printable character
// \M-\C-x        meta control character, where x is an ASCII printable character
// \M-\cx         same as above
// \c\M-x         same as above
// \c? or \C-?    delete, ASCII 7Fh (DEL)
//
// When expect_single_codepoint is true, YP_UNESCAPE_FLAG_EXPECT_SINGLE is
// passed along to the escape reader.
static void
yp_unescape_manipulate_string_or_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
    // Without unescaping, the string can keep referencing the source.
    if (unescape_type == YP_UNESCAPE_NONE) return;

    const uint8_t *backslash = yp_memchr(string->source, '\\', string->length, parser->encoding_changed, &parser->encoding);

    // Without any backslash there is nothing to unescape either.
    if (backslash == NULL) return;

    // There is at least one escape, so the result goes into a fresh buffer. The
    // source length is used as its size, since unescaping never makes the
    // string longer.
    uint8_t *buffer = malloc(string->length);
    if (buffer == NULL) {
        yp_diagnostic_list_append(&parser->error_list, string->source, string->source + string->length, YP_ERR_MALLOC_FAILED);
        return;
    }

    // The number of bytes that have been written into the buffer so far.
    size_t written = 0;

    // The cursor trails the backslash: everything between the two is plain
    // text that still has to be copied over.
    const uint8_t *cursor = string->source;
    const uint8_t *end = string->source + string->length;

    // The flags handed to the escape reader are the same for every escape.
    const uint8_t flags = (uint8_t) (expect_single_codepoint ? YP_UNESCAPE_FLAG_EXPECT_SINGLE : YP_UNESCAPE_FLAG_NONE);

    // Walk from escape to escape. A backslash that is the very last byte of
    // the string is not an escape and ends the walk.
    while (backslash != NULL && backslash + 1 < end) {
        assert(written < string->length);

        // Copy the plain text that sits in front of this escape.
        size_t literal_length = (size_t) (backslash - cursor);
        memcpy(buffer + written, cursor, literal_length);
        written += literal_length;

        const uint8_t escaped = backslash[1];

        if (escaped == '\\' || escaped == '\'') {
            // These two are unescaped regardless of the unescape type.
            buffer[written++] = unescape_chars[escaped];
            cursor = backslash + 2;
        } else if (unescape_type == YP_UNESCAPE_MINIMAL) {
            // Keep the backslash; the character after it gets copied as part
            // of the next plain text segment.
            buffer[written++] = '\\';
            cursor = backslash + 1;
        } else {
            // Full unescaping is the only type that is left.
            assert(unescape_type == YP_UNESCAPE_ALL);
            cursor = unescape(parser, buffer, &written, backslash, end, flags, &parser->error_list);
        }

        // Look for the next escape in whatever is left of the source.
        if (cursor >= end) {
            backslash = NULL;
        } else {
            backslash = yp_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, &parser->encoding);
        }
    }

    // Copy the plain text that follows the last escape, if there is any.
    const size_t trailing = (end > cursor) ? (size_t) (end - cursor) : 0;
    if (trailing > 0) {
        memcpy(buffer + written, cursor, trailing);
    }

    // Release whatever the string held before; this has to happen after the
    // copies above because they read from the string's previous contents.
    yp_string_free(string);

    // Hand the buffer over together with the number of bytes actually used,
    // which can be less than what was allocated.
    yp_string_owned_init(string, buffer, written + trailing);
}

// Unescape the given string in place using the given unescape type. Any number
// of codepoints is accepted within a single \u{...} escape.
YP_EXPORTED_FUNCTION void
yp_unescape_manipulate_string(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
    const bool expect_single_codepoint = false;
    yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, expect_single_codepoint);
}

// Unescape the given character literal in place using the given unescape type.
// This is the same as unescaping a string, except that a single codepoint is
// expected within a \u{...} escape.
void
yp_unescape_manipulate_char_literal(yp_parser_t *parser, yp_string_t *string, yp_unescape_type_t unescape_type) {
    const bool expect_single_codepoint = true;
    yp_unescape_manipulate_string_or_char_literal(parser, string, unescape_type, expect_single_codepoint);
}

// This function is similar to yp_unescape_manipulate_string, except that it
// doesn't write anything and doesn't report any errors. Instead, it returns the
// number of source bytes that the escape sequence starting at `backslash`
// takes up, with the backslash itself included. It returns 0 when the
// backslash is the last byte before the end of the parser's source. The
// unescape type must not be YP_UNESCAPE_NONE.
size_t
yp_unescape_calculate_difference(yp_parser_t *parser, const uint8_t *backslash, yp_unescape_type_t unescape_type, bool expect_single_codepoint) {
    assert(unescape_type != YP_UNESCAPE_NONE);

    // A backslash without anything after it is not an escape sequence.
    const uint8_t *escaped = backslash + 1;
    if (escaped >= parser->end) return 0;

    // These two are always exactly the backslash and one more byte.
    if (*escaped == '\\' || *escaped == '\'') return 2;

    // With minimal unescaping, every other escape is the backslash followed by
    // a single character.
    if (unescape_type == YP_UNESCAPE_MINIMAL) {
        return 1 + yp_char_width(parser, escaped, parser->end);
    }

    // Full unescaping is the only type that is left.
    assert(unescape_type == YP_UNESCAPE_ALL);

    const uint8_t flags = (uint8_t) (expect_single_codepoint ? YP_UNESCAPE_FLAG_EXPECT_SINGLE : YP_UNESCAPE_FLAG_NONE);

    // With no destination and no error list, the escape is only measured.
    const uint8_t *after = unescape(parser, NULL, NULL, backslash, parser->end, flags, NULL);
    assert(after > backslash);

    return (size_t) (after - backslash);
}

// This is one of the main entry points into the extension. It accepts a source
// buffer with its length, a type of unescaping, and a pointer to a result
// string. The result is first initialized as a shared view of the source and
// then unescaped using a parser that only lives for the duration of this
// call. Returns true if that parser's error list is empty afterwards.
YP_EXPORTED_FUNCTION bool
yp_unescape_string(const uint8_t *start, size_t length, yp_unescape_type_t unescape_type, yp_string_t *result) {
    yp_parser_t scratch;
    yp_parser_init(&scratch, start, length, NULL);

    yp_string_shared_init(result, start, start + length);
    yp_unescape_manipulate_string(&scratch, result, unescape_type);

    // The error list has to be checked before the parser is freed.
    const bool no_errors = yp_list_empty_p(&scratch.error_list);
    yp_parser_free(&scratch);

    return no_errors;
}