diff options
Diffstat (limited to 'prism/util')
-rw-r--r-- | prism/util/pm_buffer.c | 317 | ||||
-rw-r--r-- | prism/util/pm_buffer.h | 218 | ||||
-rw-r--r-- | prism/util/pm_char.c | 318 | ||||
-rw-r--r-- | prism/util/pm_char.h | 204 | ||||
-rw-r--r-- | prism/util/pm_constant_pool.c | 338 | ||||
-rw-r--r-- | prism/util/pm_constant_pool.h | 218 | ||||
-rw-r--r-- | prism/util/pm_integer.c | 670 | ||||
-rw-r--r-- | prism/util/pm_integer.h | 126 | ||||
-rw-r--r-- | prism/util/pm_list.c | 49 | ||||
-rw-r--r-- | prism/util/pm_list.h | 97 | ||||
-rw-r--r-- | prism/util/pm_memchr.c | 35 | ||||
-rw-r--r-- | prism/util/pm_memchr.h | 29 | ||||
-rw-r--r-- | prism/util/pm_newline_list.c | 96 | ||||
-rw-r--r-- | prism/util/pm_newline_list.h | 102 | ||||
-rw-r--r-- | prism/util/pm_string.c | 320 | ||||
-rw-r--r-- | prism/util/pm_string.h | 166 | ||||
-rw-r--r-- | prism/util/pm_strncasecmp.c | 24 | ||||
-rw-r--r-- | prism/util/pm_strncasecmp.h | 32 | ||||
-rw-r--r-- | prism/util/pm_strpbrk.c | 206 | ||||
-rw-r--r-- | prism/util/pm_strpbrk.h | 46 |
20 files changed, 3611 insertions, 0 deletions
diff --git a/prism/util/pm_buffer.c b/prism/util/pm_buffer.c new file mode 100644 index 0000000000..1e79e46718 --- /dev/null +++ b/prism/util/pm_buffer.c @@ -0,0 +1,317 @@ +#include "prism/util/pm_buffer.h" + +/** + * Return the size of the pm_buffer_t struct. + */ +size_t +pm_buffer_sizeof(void) { + return sizeof(pm_buffer_t); +} + +/** + * Initialize a pm_buffer_t with the given capacity. + */ +bool +pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity) { + buffer->length = 0; + buffer->capacity = capacity; + + buffer->value = (char *) xmalloc(capacity); + return buffer->value != NULL; +} + +/** + * Initialize a pm_buffer_t with its default values. + */ +bool +pm_buffer_init(pm_buffer_t *buffer) { + return pm_buffer_init_capacity(buffer, 1024); +} + +/** + * Return the value of the buffer. + */ +char * +pm_buffer_value(const pm_buffer_t *buffer) { + return buffer->value; +} + +/** + * Return the length of the buffer. + */ +size_t +pm_buffer_length(const pm_buffer_t *buffer) { + return buffer->length; +} + +/** + * Append the given amount of space to the buffer. + */ +static inline bool +pm_buffer_append_length(pm_buffer_t *buffer, size_t length) { + size_t next_length = buffer->length + length; + + if (next_length > buffer->capacity) { + if (buffer->capacity == 0) { + buffer->capacity = 1; + } + + while (next_length > buffer->capacity) { + buffer->capacity *= 2; + } + + buffer->value = xrealloc(buffer->value, buffer->capacity); + if (buffer->value == NULL) return false; + } + + buffer->length = next_length; + return true; +} + +/** + * Append a generic pointer to memory to the buffer. + */ +static inline void +pm_buffer_append(pm_buffer_t *buffer, const void *source, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memcpy(buffer->value + cursor, source, length); + } +} + +/** + * Append the given amount of space as zeroes to the buffer. + */ +void +pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memset(buffer->value + cursor, 0, length); + } +} + +/** + * Append a formatted string to the buffer. + */ +void +pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) { + va_list arguments; + va_start(arguments, format); + int result = vsnprintf(NULL, 0, format, arguments); + va_end(arguments); + + if (result < 0) return; + size_t length = (size_t) (result + 1); + + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + va_start(arguments, format); + vsnprintf(buffer->value + cursor, length, format, arguments); + va_end(arguments); + buffer->length--; + } +} + +/** + * Append a string to the buffer. + */ +void +pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length) { + pm_buffer_append(buffer, value, length); +} + +/** + * Append a list of bytes to the buffer. + */ +void +pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length) { + pm_buffer_append(buffer, (const char *) value, length); +} + +/** + * Append a single byte to the buffer. + */ +void +pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) { + const void *source = &value; + pm_buffer_append(buffer, source, sizeof(uint8_t)); +} + +/** + * Append a 32-bit unsigned integer to the buffer as a variable-length integer. + */ +void +pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) { + if (value < 128) { + pm_buffer_append_byte(buffer, (uint8_t) value); + } else { + uint32_t n = value; + while (n >= 128) { + pm_buffer_append_byte(buffer, (uint8_t) (n | 128)); + n >>= 7; + } + pm_buffer_append_byte(buffer, (uint8_t) n); + } +} + +/** + * Append a 32-bit signed integer to the buffer as a variable-length integer. + */ +void +pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) { + uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31)); + pm_buffer_append_varuint(buffer, unsigned_int); +} + +/** + * Append a double to the buffer. + */ +void +pm_buffer_append_double(pm_buffer_t *buffer, double value) { + const void *source = &value; + pm_buffer_append(buffer, source, sizeof(double)); +} + +/** + * Append a slice of source code to the buffer. + */ +void +pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping) { + for (size_t index = 0; index < length; index++) { + const uint8_t byte = source[index]; + + if ((byte <= 0x06) || (byte >= 0x0E && byte <= 0x1F) || (byte >= 0x7F)) { + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_format(buffer, "\\x%02X", byte); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + } else { + switch (byte) { + case '\a': + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_string(buffer, "\\a", 2); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + break; + case '\b': + pm_buffer_append_string(buffer, "\\b", 2); + break; + case '\t': + pm_buffer_append_string(buffer, "\\t", 2); + break; + case '\n': + pm_buffer_append_string(buffer, "\\n", 2); + break; + case '\v': + if (escaping == PM_BUFFER_ESCAPING_RUBY) { + pm_buffer_append_string(buffer, "\\v", 2); + } else { + pm_buffer_append_format(buffer, "\\u%04X", byte); + } + break; + case '\f': + pm_buffer_append_string(buffer, "\\f", 2); + break; + case '\r': + pm_buffer_append_string(buffer, "\\r", 2); + break; + case '"': + pm_buffer_append_string(buffer, "\\\"", 2); + break; + case '#': { + if (escaping == PM_BUFFER_ESCAPING_RUBY && index + 1 < length) { + const uint8_t next_byte = source[index + 1]; + if (next_byte == '{' || next_byte == '@' || next_byte == '$') { + pm_buffer_append_byte(buffer, '\\'); + } + } + + pm_buffer_append_byte(buffer, '#'); + break; + } + case '\\': + pm_buffer_append_string(buffer, "\\\\", 2); + break; + default: + pm_buffer_append_byte(buffer, byte); + break; + } + } + } +} + +/** + * Prepend the given string to the buffer. + */ +void +pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length) { + size_t cursor = buffer->length; + if (pm_buffer_append_length(buffer, length)) { + memmove(buffer->value + length, buffer->value, cursor); + memcpy(buffer->value, value, length); + } +} + +/** + * Concatenate one buffer onto another. + */ +void +pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source) { + if (source->length > 0) { + pm_buffer_append(destination, source->value, source->length); + } +} + +/** + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + */ +void +pm_buffer_clear(pm_buffer_t *buffer) { + buffer->length = 0; +} + +/** + * Strip the whitespace from the end of the buffer. + */ +void +pm_buffer_rstrip(pm_buffer_t *buffer) { + while (buffer->length > 0 && pm_char_is_whitespace((uint8_t) buffer->value[buffer->length - 1])) { + buffer->length--; + } +} + +/** + * Checks if the buffer includes the given value. + */ +size_t +pm_buffer_index(const pm_buffer_t *buffer, char value) { + const char *first = memchr(buffer->value, value, buffer->length); + return (first == NULL) ? SIZE_MAX : (size_t) (first - buffer->value); +} + +/** + * Insert the given string into the buffer at the given index. + */ +void +pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length) { + assert(index <= buffer->length); + + if (index == buffer->length) { + pm_buffer_append_string(buffer, value, length); + } else { + pm_buffer_append_zeroes(buffer, length); + memmove(buffer->value + index + length, buffer->value + index, buffer->length - length - index); + memcpy(buffer->value + index, value, length); + } +} + +/** + * Free the memory associated with the buffer. + */ +void +pm_buffer_free(pm_buffer_t *buffer) { + xfree(buffer->value); +} diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h new file mode 100644 index 0000000000..58f7b506cf --- /dev/null +++ b/prism/util/pm_buffer.h @@ -0,0 +1,218 @@ +/** + * @file pm_buffer.h + * + * A wrapper around a contiguous block of allocated memory. + */ +#ifndef PRISM_BUFFER_H +#define PRISM_BUFFER_H + +#include "prism/defines.h" +#include "prism/util/pm_char.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +/** + * A pm_buffer_t is a simple memory buffer that stores data in a contiguous + * block of memory. + */ +typedef struct { + /** The length of the buffer in bytes. */ + size_t length; + + /** The capacity of the buffer in bytes that has been allocated. */ + size_t capacity; + + /** A pointer to the start of the buffer. */ + char *value; +} pm_buffer_t; + +/** + * Return the size of the pm_buffer_t struct. + * + * @returns The size of the pm_buffer_t struct. + */ +PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void); + +/** + * Initialize a pm_buffer_t with the given capacity. + * + * @param buffer The buffer to initialize. + * @param capacity The capacity of the buffer. + * @returns True if the buffer was initialized successfully, false otherwise. + */ +bool pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity); + +/** + * Initialize a pm_buffer_t with its default values. + * + * @param buffer The buffer to initialize. + * @returns True if the buffer was initialized successfully, false otherwise. + */ +PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer); + +/** + * Return the value of the buffer. + * + * @param buffer The buffer to get the value of. + * @returns The value of the buffer. + */ +PRISM_EXPORTED_FUNCTION char * pm_buffer_value(const pm_buffer_t *buffer); + +/** + * Return the length of the buffer. + * + * @param buffer The buffer to get the length of. + * @returns The length of the buffer. + */ +PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(const pm_buffer_t *buffer); + +/** + * Append the given amount of space as zeroes to the buffer. + * + * @param buffer The buffer to append to. + * @param length The amount of space to append and zero. + */ +void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length); + +/** + * Append a formatted string to the buffer. + * + * @param buffer The buffer to append to. + * @param format The format string to append. + * @param ... The arguments to the format string. + */ +void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3); + +/** + * Append a string to the buffer. + * + * @param buffer The buffer to append to. + * @param value The string to append. + * @param length The length of the string to append. + */ +void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length); + +/** + * Append a list of bytes to the buffer. + * + * @param buffer The buffer to append to. + * @param value The bytes to append. + * @param length The length of the bytes to append. + */ +void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length); + +/** + * Append a single byte to the buffer. + * + * @param buffer The buffer to append to. + * @param value The byte to append. + */ +void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value); + +/** + * Append a 32-bit unsigned integer to the buffer as a variable-length integer. + * + * @param buffer The buffer to append to. + * @param value The integer to append. + */ +void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value); + +/** + * Append a 32-bit signed integer to the buffer as a variable-length integer. + * + * @param buffer The buffer to append to. + * @param value The integer to append. + */ +void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); + +/** + * Append a double to the buffer. + * + * @param buffer The buffer to append to. + * @param value The double to append. + */ +void pm_buffer_append_double(pm_buffer_t *buffer, double value); + +/** + * The different types of escaping that can be performed by the buffer when + * appending a slice of Ruby source code. + */ +typedef enum { + PM_BUFFER_ESCAPING_RUBY, + PM_BUFFER_ESCAPING_JSON +} pm_buffer_escaping_t; + +/** + * Append a slice of source code to the buffer. + * + * @param buffer The buffer to append to. + * @param source The source code to append. + * @param length The length of the source code to append. + * @param escaping The type of escaping to perform. + */ +void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping); + +/** + * Prepend the given string to the buffer. + * + * @param buffer The buffer to prepend to. + * @param value The string to prepend. + * @param length The length of the string to prepend. + */ +void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length); + +/** + * Concatenate one buffer onto another. + * + * @param destination The buffer to concatenate onto. + * @param source The buffer to concatenate. + */ +void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source); + +/** + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + * + * @param buffer The buffer to clear. + */ +void pm_buffer_clear(pm_buffer_t *buffer); + +/** + * Strip the whitespace from the end of the buffer. + * + * @param buffer The buffer to strip. + */ +void pm_buffer_rstrip(pm_buffer_t *buffer); + +/** + * Checks if the buffer includes the given value. + * + * @param buffer The buffer to check. + * @param value The value to check for. + * @returns The index of the first occurrence of the value in the buffer, or + * SIZE_MAX if the value is not found. + */ +size_t pm_buffer_index(const pm_buffer_t *buffer, char value); + +/** + * Insert the given string into the buffer at the given index. + * + * @param buffer The buffer to insert into. + * @param index The index to insert at. + * @param value The string to insert. + * @param length The length of the string to insert. + */ +void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length); + +/** + * Free the memory associated with the buffer. + * + * @param buffer The buffer to free. + */ +PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer); + +#endif diff --git a/prism/util/pm_char.c b/prism/util/pm_char.c new file mode 100644 index 0000000000..dce19abd1b --- /dev/null +++ b/prism/util/pm_char.c @@ -0,0 +1,318 @@ +#include "prism/util/pm_char.h" + +#define PRISM_CHAR_BIT_WHITESPACE (1 << 0) +#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1) +#define PRISM_CHAR_BIT_REGEXP_OPTION (1 << 2) + +#define PRISM_NUMBER_BIT_BINARY_DIGIT (1 << 0) +#define PRISM_NUMBER_BIT_BINARY_NUMBER (1 << 1) +#define PRISM_NUMBER_BIT_OCTAL_DIGIT (1 << 2) +#define PRISM_NUMBER_BIT_OCTAL_NUMBER (1 << 3) +#define PRISM_NUMBER_BIT_DECIMAL_DIGIT (1 << 4) +#define PRISM_NUMBER_BIT_DECIMAL_NUMBER (1 << 5) +#define PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT (1 << 6) +#define PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER (1 << 7) + +static const uint8_t pm_byte_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 4x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 5x + 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 6x + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx +}; + +static const uint8_t pm_number_table[256] = { + // 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 1x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 2x + 0xff, 0xff, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 3x + 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 4x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, // 5x + 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 6x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 7x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 8x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 9x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ax + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Bx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Cx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Dx + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ex + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Fx +}; + +/** + * Returns the number of characters at the start of the string that match the + * given kind. Disallows searching past the given maximum number of characters. + */ +static inline size_t +pm_strspn_char_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_byte_table[string[size]] & kind)) size++; + return size; +} + +/** + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_WHITESPACE); +} + +/** + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. + */ +size_t +pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_WHITESPACE)) { + if (string[size] == '\n') { + pm_newline_list_append(newline_list, string + size); + } + + size++; + } + + return size; +} + +/** + * Returns the number of characters at the start of the string that are inline + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_INLINE_WHITESPACE); +} + +/** + * Returns the number of characters at the start of the string that are regexp + * options. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_REGEXP_OPTION); +} + +/** + * Returns true if the given character matches the given kind. + */ +static inline bool +pm_char_is_char_kind(const uint8_t b, uint8_t kind) { + return (pm_byte_table[b] & kind) != 0; +} + +/** + * Returns true if the given character is a whitespace character. + */ +bool +pm_char_is_whitespace(const uint8_t b) { + return pm_char_is_char_kind(b, PRISM_CHAR_BIT_WHITESPACE); +} + +/** + * Returns true if the given character is an inline whitespace character. + */ +bool +pm_char_is_inline_whitespace(const uint8_t b) { + return pm_char_is_char_kind(b, PRISM_CHAR_BIT_INLINE_WHITESPACE); +} + +/** + * Scan through the string and return the number of characters at the start of + * the string that match the given kind. Disallows searching past the given + * maximum number of characters. + */ +static inline size_t +pm_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + while (size < maximum && (pm_number_table[string[size]] & kind)) size++; + return size; +} + +/** + * Scan through the string and return the number of characters at the start of + * the string that match the given kind. Disallows searching past the given + * maximum number of characters. + * + * Additionally, report the location of the last invalid underscore character + * found in the string through the out invalid parameter. + */ +static inline size_t +pm_strspn_number_kind_underscores(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid, uint8_t kind) { + if (length <= 0) return 0; + + size_t size = 0; + size_t maximum = (size_t) length; + + bool underscore = false; + while (size < maximum && (pm_number_table[string[size]] & kind)) { + if (string[size] == '_') { + if (underscore) *invalid = string + size; + underscore = true; + } else { + underscore = false; + } + + size++; + } + + if (string[size - 1] == '_') *invalid = string + size - 1; + return size; +} + +/** + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_BINARY_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_OCTAL_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + */ +size_t +pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_DECIMAL_DIGIT); +} + +/** + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore + */ +size_t +pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_DECIMAL_NUMBER); +} + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + */ +size_t +pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length) { + return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT); +} + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t +pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) { + return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER); +} + +/** + * Returns true if the given character matches the given kind. + */ +static inline bool +pm_char_is_number_kind(const uint8_t b, uint8_t kind) { + return (pm_number_table[b] & kind) != 0; +} + +/** + * Returns true if the given character is a binary digit. + */ +bool +pm_char_is_binary_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_BINARY_DIGIT); +} + +/** + * Returns true if the given character is an octal digit. + */ +bool +pm_char_is_octal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_OCTAL_DIGIT); +} + +/** + * Returns true if the given character is a decimal digit. + */ +bool +pm_char_is_decimal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_DECIMAL_DIGIT); +} + +/** + * Returns true if the given character is a hexadecimal digit. + */ +bool +pm_char_is_hexadecimal_digit(const uint8_t b) { + return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT); +} + +#undef PRISM_CHAR_BIT_WHITESPACE +#undef PRISM_CHAR_BIT_INLINE_WHITESPACE +#undef PRISM_CHAR_BIT_REGEXP_OPTION + +#undef PRISM_NUMBER_BIT_BINARY_DIGIT +#undef PRISM_NUMBER_BIT_BINARY_NUMBER +#undef PRISM_NUMBER_BIT_OCTAL_DIGIT +#undef PRISM_NUMBER_BIT_OCTAL_NUMBER +#undef PRISM_NUMBER_BIT_DECIMAL_DIGIT +#undef PRISM_NUMBER_BIT_DECIMAL_NUMBER +#undef PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER +#undef PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT diff --git a/prism/util/pm_char.h b/prism/util/pm_char.h new file mode 100644 index 0000000000..deeafd6321 --- /dev/null +++ b/prism/util/pm_char.h @@ -0,0 +1,204 @@ +/** + * @file pm_char.h + * + * Functions for working with characters and strings. + */ +#ifndef PRISM_CHAR_H +#define PRISM_CHAR_H + +#include "prism/defines.h" +#include "prism/util/pm_newline_list.h" + +#include <stdbool.h> +#include <stddef.h> + +/** + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are + * whitespace. + */ +size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param newline_list The list of newlines to populate. + * @return The number of characters at the start of the string that are + * whitespace. + */ +size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list); + +/** + * Returns the number of characters at the start of the string that are inline + * whitespace. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are inline + * whitespace. + */ +size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are decimal + * digits. + */ +size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are + * hexadecimal digits. + */ +size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are octal + * digits or underscores. + */ +size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are decimal + * digits or underscores. + */ +size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are + * hexadecimal digits or underscores. + */ +size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns the number of characters at the start of the string that are regexp + * options. Disallows searching past the given maximum number of characters. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @return The number of characters at the start of the string that are regexp + * options. + */ +size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length); + +/** + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + * + * @param string The string to search. + * @param length The maximum number of characters to search. + * @param invalid The pointer to set to the index of the first invalid + * underscore. + * @return The number of characters at the start of the string that are binary + * digits or underscores. + */ +size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/** + * Returns true if the given character is a whitespace character. + * + * @param b The character to check. + * @return True if the given character is a whitespace character. + */ +bool pm_char_is_whitespace(const uint8_t b); + +/** + * Returns true if the given character is an inline whitespace character. + * + * @param b The character to check. + * @return True if the given character is an inline whitespace character. + */ +bool pm_char_is_inline_whitespace(const uint8_t b); + +/** + * Returns true if the given character is a binary digit. + * + * @param b The character to check. + * @return True if the given character is a binary digit. + */ +bool pm_char_is_binary_digit(const uint8_t b); + +/** + * Returns true if the given character is an octal digit. + * + * @param b The character to check. + * @return True if the given character is an octal digit. + */ +bool pm_char_is_octal_digit(const uint8_t b); + +/** + * Returns true if the given character is a decimal digit. + * + * @param b The character to check. + * @return True if the given character is a decimal digit. + */ +bool pm_char_is_decimal_digit(const uint8_t b); + +/** + * Returns true if the given character is a hexadecimal digit. + * + * @param b The character to check. + * @return True if the given character is a hexadecimal digit. + */ +bool pm_char_is_hexadecimal_digit(const uint8_t b); + +#endif diff --git a/prism/util/pm_constant_pool.c b/prism/util/pm_constant_pool.c new file mode 100644 index 0000000000..624002cec9 --- /dev/null +++ b/prism/util/pm_constant_pool.c @@ -0,0 +1,338 @@ +#include "prism/util/pm_constant_pool.h" + +/** + * Initialize a list of constant ids. + */ +void +pm_constant_id_list_init(pm_constant_id_list_t *list) { + list->ids = NULL; + list->size = 0; + list->capacity = 0; +} + +/** + * Initialize a list of constant ids with a given capacity. + */ +void +pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity) { + list->ids = xcalloc(capacity, sizeof(pm_constant_id_t)); + if (list->ids == NULL) abort(); + + list->size = 0; + list->capacity = capacity; +} + +/** + * Append a constant id to a list of constant ids. Returns false if any + * potential reallocations fail. + */ +bool +pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id) { + if (list->size >= list->capacity) { + list->capacity = list->capacity == 0 ? 8 : list->capacity * 2; + list->ids = (pm_constant_id_t *) xrealloc(list->ids, sizeof(pm_constant_id_t) * list->capacity); + if (list->ids == NULL) return false; + } + + list->ids[list->size++] = id; + return true; +} + +/** + * Insert a constant id into a list of constant ids at the specified index. + */ +void +pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id) { + assert(index < list->capacity); + assert(list->ids[index] == PM_CONSTANT_ID_UNSET); + + list->ids[index] = id; + list->size++; +} + +/** + * Checks if the current constant id list includes the given constant id. + */ +bool +pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id) { + for (size_t index = 0; index < list->size; index++) { + if (list->ids[index] == id) return true; + } + return false; +} + +/** + * Free the memory associated with a list of constant ids. + */ +void +pm_constant_id_list_free(pm_constant_id_list_t *list) { + if (list->ids != NULL) { + xfree(list->ids); + } +} + +/** + * A relatively simple hash function (djb2) that is used to hash strings. We are + * optimizing here for simplicity and speed. + */ +static inline uint32_t +pm_constant_pool_hash(const uint8_t *start, size_t length) { + // This is a prime number used as the initial value for the hash function. + uint32_t value = 5381; + + for (size_t index = 0; index < length; index++) { + value = ((value << 5) + value) + start[index]; + } + + return value; +} + +/** + * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + */ +static uint32_t +next_power_of_two(uint32_t v) { + // Avoid underflow in subtraction on next line. + if (v == 0) { + // 1 is the nearest power of 2 to 0 (2^0) + return 1; + } + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v++; + return v; +} + +#ifndef NDEBUG +static bool +is_power_of_two(uint32_t size) { + return (size & (size - 1)) == 0; +} +#endif + +/** + * Resize a constant pool to a given capacity. + */ +static inline bool +pm_constant_pool_resize(pm_constant_pool_t *pool) { + assert(is_power_of_two(pool->capacity)); + + uint32_t next_capacity = pool->capacity * 2; + if (next_capacity < pool->capacity) return false; + + const uint32_t mask = next_capacity - 1; + const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); + + void *next = xcalloc(next_capacity, element_size); + if (next == NULL) return false; + + pm_constant_pool_bucket_t *next_buckets = next; + pm_constant_t *next_constants = (void *)(((char *) next) + next_capacity * sizeof(pm_constant_pool_bucket_t)); + + // For each bucket in the current constant pool, find the index in the + // next constant pool, and insert it. + for (uint32_t index = 0; index < pool->capacity; index++) { + pm_constant_pool_bucket_t *bucket = &pool->buckets[index]; + + // If an id is set on this constant, then we know we have content here. + // In this case we need to insert it into the next constant pool. + if (bucket->id != PM_CONSTANT_ID_UNSET) { + uint32_t next_index = bucket->hash & mask; + + // This implements linear scanning to find the next available slot + // in case this index is already taken. We don't need to bother + // comparing the values since we know that the hash is unique. + while (next_buckets[next_index].id != PM_CONSTANT_ID_UNSET) { + next_index = (next_index + 1) & mask; + } + + // Here we copy over the entire bucket, which includes the id so + // that they are consistent between resizes. + next_buckets[next_index] = *bucket; + } + } + + // The constants are stable with respect to hash table resizes. + memcpy(next_constants, pool->constants, pool->size * sizeof(pm_constant_t)); + + // pool->constants and pool->buckets are allocated out of the same chunk + // of memory, with the buckets coming first. + xfree(pool->buckets); + pool->constants = next_constants; + pool->buckets = next_buckets; + pool->capacity = next_capacity; + return true; +} + +/** + * Initialize a new constant pool with a given capacity. + */ +bool +pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity) { + const uint32_t maximum = (~((uint32_t) 0)); + if (capacity >= ((maximum / 2) + 1)) return false; + + capacity = next_power_of_two(capacity); + const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); + void *memory = xcalloc(capacity, element_size); + if (memory == NULL) return false; + + pool->buckets = memory; + pool->constants = (void *)(((char *)memory) + capacity * sizeof(pm_constant_pool_bucket_t)); + pool->size = 0; + pool->capacity = capacity; + return true; +} + +/** + * Return a pointer to the constant indicated by the given constant id. + */ +pm_constant_t * +pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id) { + assert(constant_id != PM_CONSTANT_ID_UNSET && constant_id <= pool->size); + return &pool->constants[constant_id - 1]; +} + +/** + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + */ +pm_constant_id_t +pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + assert(is_power_of_two(pool->capacity)); + const uint32_t mask = pool->capacity - 1; + + uint32_t hash = pm_constant_pool_hash(start, length); + uint32_t index = hash & mask; + pm_constant_pool_bucket_t *bucket; + + while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) { + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + return bucket->id; + } + + index = (index + 1) & mask; + } + + return PM_CONSTANT_ID_UNSET; +} + +/** + * Insert a constant into a constant pool and return its index in the pool. + */ +static inline pm_constant_id_t +pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t length, pm_constant_pool_bucket_type_t type) { + if (pool->size >= (pool->capacity / 4 * 3)) { + if (!pm_constant_pool_resize(pool)) return PM_CONSTANT_ID_UNSET; + } + + assert(is_power_of_two(pool->capacity)); + const uint32_t mask = pool->capacity - 1; + + uint32_t hash = pm_constant_pool_hash(start, length); + uint32_t index = hash & mask; + pm_constant_pool_bucket_t *bucket; + + while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) { + // If there is a collision, then we need to check if the content is the + // same as the content we are trying to insert. If it is, then we can + // return the id of the existing constant. + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + + if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + // Since we have found a match, we need to check if this is + // attempting to insert a shared or an owned constant. We want to + // prefer shared constants since they don't require allocations. + if (type == PM_CONSTANT_POOL_BUCKET_OWNED) { + // If we're attempting to insert an owned constant and we have + // an existing constant, then either way we don't want the given + // memory. Either it's duplicated with the existing constant or + // it's not necessary because we have a shared version. + xfree((void *) start); + } else if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { + // If we're attempting to insert a shared constant and the + // existing constant is owned, then we can free the owned + // constant and replace it with the shared constant. + xfree((void *) constant->start); + constant->start = start; + bucket->type = (unsigned int) (PM_CONSTANT_POOL_BUCKET_DEFAULT & 0x3); + } + + return bucket->id; + } + + index = (index + 1) & mask; + } + + // IDs are allocated starting at 1, since the value 0 denotes a non-existent + // constant. + uint32_t id = ++pool->size; + assert(pool->size < ((uint32_t) (1 << 30))); + + *bucket = (pm_constant_pool_bucket_t) { + .id = (unsigned int) (id & 0x3fffffff), + .type = (unsigned int) (type & 0x3), + .hash = hash + }; + + pool->constants[id - 1] = (pm_constant_t) { + .start = start, + .length = length, + }; + + return id; +} + +/** + * Insert a constant into a constant pool. Returns the id of the constant, or + * PM_CONSTANT_ID_UNSET if any potential calls to resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_DEFAULT); +} + +/** + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or PM_CONSTANT_ID_UNSET if any + * potential calls to resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_OWNED); +} + +/** + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or PM_CONSTANT_ID_UNSET if any potential calls to + * resize fail. + */ +pm_constant_id_t +pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_CONSTANT); +} + +/** + * Free the memory associated with a constant pool. + */ +void +pm_constant_pool_free(pm_constant_pool_t *pool) { + // For each constant in the current constant pool, free the contents if the + // contents are owned. + for (uint32_t index = 0; index < pool->capacity; index++) { + pm_constant_pool_bucket_t *bucket = &pool->buckets[index]; + + // If an id is set on this constant, then we know we have content here. + if (bucket->id != PM_CONSTANT_ID_UNSET && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { + pm_constant_t *constant = &pool->constants[bucket->id - 1]; + xfree((void *) constant->start); + } + } + + xfree(pool->buckets); +} diff --git a/prism/util/pm_constant_pool.h b/prism/util/pm_constant_pool.h new file mode 100644 index 0000000000..6df23f8f50 --- /dev/null +++ b/prism/util/pm_constant_pool.h @@ -0,0 +1,218 @@ +/** + * @file pm_constant_pool.h + * + * A data structure that stores a set of strings. + * + * Each string is assigned a unique id, which can be used to compare strings for + * equality. This comparison ends up being much faster than strcmp, since it + * only requires a single integer comparison. + */ +#ifndef PRISM_CONSTANT_POOL_H +#define PRISM_CONSTANT_POOL_H + +#include "prism/defines.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +/** + * When we allocate constants into the pool, we reserve 0 to mean that the slot + * is not yet filled. This constant is reused in other places to indicate the + * lack of a constant id. + */ +#define PM_CONSTANT_ID_UNSET 0 + +/** + * A constant id is a unique identifier for a constant in the constant pool. + */ +typedef uint32_t pm_constant_id_t; + +/** + * A list of constant IDs. Usually used to represent a set of locals. + */ +typedef struct { + /** The number of constant ids in the list. */ + size_t size; + + /** The number of constant ids that have been allocated in the list. */ + size_t capacity; + + /** The constant ids in the list. */ + pm_constant_id_t *ids; +} pm_constant_id_list_t; + +/** + * Initialize a list of constant ids. + * + * @param list The list to initialize. + */ +void pm_constant_id_list_init(pm_constant_id_list_t *list); + +/** + * Initialize a list of constant ids with a given capacity. + * + * @param list The list to initialize. + * @param capacity The initial capacity of the list. + */ +void pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity); + +/** + * Append a constant id to a list of constant ids. Returns false if any + * potential reallocations fail. + * + * @param list The list to append to. + * @param id The id to append. + * @return Whether the append succeeded. + */ +bool pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id); + +/** + * Insert a constant id into a list of constant ids at the specified index. + * + * @param list The list to insert into. + * @param index The index at which to insert. + * @param id The id to insert. + */ +void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id); + +/** + * Checks if the current constant id list includes the given constant id. + * + * @param list The list to check. + * @param id The id to check for. + * @return Whether the list includes the given id. + */ +bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id); + +/** + * Free the memory associated with a list of constant ids. + * + * @param list The list to free. + */ +void pm_constant_id_list_free(pm_constant_id_list_t *list); + +/** + * The type of bucket in the constant pool hash map. This determines how the + * bucket should be freed. + */ +typedef unsigned int pm_constant_pool_bucket_type_t; + +/** By default, each constant is a slice of the source. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0; + +/** An owned constant is one for which memory has been allocated. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1; + +/** A constant constant is known at compile time. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2; + +/** A bucket in the hash map. */ +typedef struct { + /** The incremental ID used for indexing back into the pool. */ + unsigned int id: 30; + + /** The type of the bucket, which determines how to free it. */ + pm_constant_pool_bucket_type_t type: 2; + + /** The hash of the bucket. */ + uint32_t hash; +} pm_constant_pool_bucket_t; + +/** A constant in the pool which effectively stores a string. */ +typedef struct { + /** A pointer to the start of the string. */ + const uint8_t *start; + + /** The length of the string. */ + size_t length; +} pm_constant_t; + +/** The overall constant pool, which stores constants found while parsing. */ +typedef struct { + /** The buckets in the hash map. */ + pm_constant_pool_bucket_t *buckets; + + /** The constants that are stored in the buckets. */ + pm_constant_t *constants; + + /** The number of buckets in the hash map. */ + uint32_t size; + + /** The number of buckets that have been allocated in the hash map. */ + uint32_t capacity; +} pm_constant_pool_t; + +/** + * Initialize a new constant pool with a given capacity. + * + * @param pool The pool to initialize. + * @param capacity The initial capacity of the pool. + * @return Whether the initialization succeeded. + */ +bool pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity); + +/** + * Return a pointer to the constant indicated by the given constant id. + * + * @param pool The pool to get the constant from. + * @param constant_id The id of the constant to get. + * @return A pointer to the constant. + */ +pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id); + +/** + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + * + * @param pool The pool to find the constant in. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool that is a slice of a source string. + * Returns the id of the constant, or 0 if any potential calls to resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or 0 if any potential calls to + * resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length); + +/** + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or 0 if any potential calls to resize fail. + * + * @param pool The pool to insert the constant into. + * @param start A pointer to the start of the constant. + * @param length The length of the constant. + * @return The id of the constant. + */ +pm_constant_id_t pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/** + * Free the memory associated with a constant pool. + * + * @param pool The pool to free. + */ +void pm_constant_pool_free(pm_constant_pool_t *pool); + +#endif diff --git a/prism/util/pm_integer.c b/prism/util/pm_integer.c new file mode 100644 index 0000000000..5b0f34465c --- /dev/null +++ b/prism/util/pm_integer.c @@ -0,0 +1,670 @@ +#include "prism/util/pm_integer.h" + +/** + * Pull out the length and values from the integer, regardless of the form in + * which the length/values are stored. + */ +#define INTEGER_EXTRACT(integer, length_variable, values_variable) \ + if ((integer)->values == NULL) { \ + length_variable = 1; \ + values_variable = &(integer)->value; \ + } else { \ + length_variable = (integer)->length; \ + values_variable = (integer)->values; \ + } + +/** + * Adds two positive pm_integer_t with the given base. + * Return pm_integer_t with values allocated. Not normalized. + */ +static void +big_add(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) { + size_t left_length; + uint32_t *left_values; + INTEGER_EXTRACT(left, left_length, left_values) + + size_t right_length; + uint32_t *right_values; + INTEGER_EXTRACT(right, right_length, right_values) + + size_t length = left_length < right_length ? right_length : left_length; + uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * (length + 1)); + if (values == NULL) return; + + uint64_t carry = 0; + for (size_t index = 0; index < length; index++) { + uint64_t sum = carry + (index < left_length ? left_values[index] : 0) + (index < right_length ? right_values[index] : 0); + values[index] = (uint32_t) (sum % base); + carry = sum / base; + } + + if (carry > 0) { + values[length] = (uint32_t) carry; + length++; + } + + *destination = (pm_integer_t) { 0, length, values, false }; +} + +/** + * Internal use for karatsuba_multiply. Calculates `a - b - c` with the given + * base. Assume a, b, c, a - b - c all to be positive. + * Return pm_integer_t with values allocated. Not normalized. + */ +static void +big_sub2(pm_integer_t *destination, pm_integer_t *a, pm_integer_t *b, pm_integer_t *c, uint64_t base) { + size_t a_length; + uint32_t *a_values; + INTEGER_EXTRACT(a, a_length, a_values) + + size_t b_length; + uint32_t *b_values; + INTEGER_EXTRACT(b, b_length, b_values) + + size_t c_length; + uint32_t *c_values; + INTEGER_EXTRACT(c, c_length, c_values) + + uint32_t *values = (uint32_t*) xmalloc(sizeof(uint32_t) * a_length); + int64_t carry = 0; + + for (size_t index = 0; index < a_length; index++) { + int64_t sub = ( + carry + + a_values[index] - + (index < b_length ? b_values[index] : 0) - + (index < c_length ? c_values[index] : 0) + ); + + if (sub >= 0) { + values[index] = (uint32_t) sub; + carry = 0; + } else { + sub += 2 * (int64_t) base; + values[index] = (uint32_t) ((uint64_t) sub % base); + carry = sub / (int64_t) base - 2; + } + } + + while (a_length > 1 && values[a_length - 1] == 0) a_length--; + *destination = (pm_integer_t) { 0, a_length, values, false }; +} + +/** + * Multiply two positive integers with the given base using karatsuba algorithm. + * Return pm_integer_t with values allocated. Not normalized. + */ +static void +karatsuba_multiply(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) { + size_t left_length; + uint32_t *left_values; + INTEGER_EXTRACT(left, left_length, left_values) + + size_t right_length; + uint32_t *right_values; + INTEGER_EXTRACT(right, right_length, right_values) + + if (left_length > right_length) { + size_t temporary_length = left_length; + left_length = right_length; + right_length = temporary_length; + + uint32_t *temporary_values = left_values; + left_values = right_values; + right_values = temporary_values; + } + + if (left_length <= 10) { + size_t length = left_length + right_length; + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + if (values == NULL) return; + + for (size_t left_index = 0; left_index < left_length; left_index++) { + uint32_t carry = 0; + for (size_t right_index = 0; right_index < right_length; right_index++) { + uint64_t product = (uint64_t) left_values[left_index] * right_values[right_index] + values[left_index + right_index] + carry; + values[left_index + right_index] = (uint32_t) (product % base); + carry = (uint32_t) (product / base); + } + values[left_index + right_length] = carry; + } + + while (length > 1 && values[length - 1] == 0) length--; + *destination = (pm_integer_t) { 0, length, values, false }; + return; + } + + if (left_length * 2 <= right_length) { + uint32_t *values = (uint32_t *) xcalloc(left_length + right_length, sizeof(uint32_t)); + + for (size_t start_offset = 0; start_offset < right_length; start_offset += left_length) { + size_t end_offset = start_offset + left_length; + if (end_offset > right_length) end_offset = right_length; + + pm_integer_t sliced_left = { + .value = 0, + .length = left_length, + .values = left_values, + .negative = false + }; + + pm_integer_t sliced_right = { + .value = 0, + .length = end_offset - start_offset, + .values = right_values + start_offset, + .negative = false + }; + + pm_integer_t product; + karatsuba_multiply(&product, &sliced_left, &sliced_right, base); + + uint32_t carry = 0; + for (size_t index = 0; index < product.length; index++) { + uint64_t sum = (uint64_t) values[start_offset + index] + product.values[index] + carry; + values[start_offset + index] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + if (carry > 0) values[start_offset + product.length] += carry; + pm_integer_free(&product); + } + + *destination = (pm_integer_t) { 0, left_length + right_length, values, false }; + return; + } + + size_t half = left_length / 2; + pm_integer_t x0 = { 0, half, left_values, false }; + pm_integer_t x1 = { 0, left_length - half, left_values + half, false }; + pm_integer_t y0 = { 0, half, right_values, false }; + pm_integer_t y1 = { 0, right_length - half, right_values + half, false }; + + pm_integer_t z0 = { 0 }; + karatsuba_multiply(&z0, &x0, &y0, base); + + pm_integer_t z2 = { 0 }; + karatsuba_multiply(&z2, &x1, &y1, base); + + // For simplicity to avoid considering negative values, + // use `z1 = (x0 + x1) * (y0 + y1) - z0 - z2` instead of original karatsuba algorithm. + pm_integer_t x01 = { 0 }; + big_add(&x01, &x0, &x1, base); + + pm_integer_t y01 = { 0 }; + big_add(&y01, &y0, &y1, base); + + pm_integer_t xy = { 0 }; + karatsuba_multiply(&xy, &x01, &y01, base); + + pm_integer_t z1; + big_sub2(&z1, &xy, &z0, &z2, base); + + size_t length = left_length + right_length; + uint32_t *values = (uint32_t*) xcalloc(length, sizeof(uint32_t)); + + assert(z0.values != NULL); + memcpy(values, z0.values, sizeof(uint32_t) * z0.length); + + assert(z2.values != NULL); + memcpy(values + 2 * half, z2.values, sizeof(uint32_t) * z2.length); + + uint32_t carry = 0; + for(size_t index = 0; index < z1.length; index++) { + uint64_t sum = (uint64_t) carry + values[index + half] + z1.values[index]; + values[index + half] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + for(size_t index = half + z1.length; carry > 0; index++) { + uint64_t sum = (uint64_t) carry + values[index]; + values[index] = (uint32_t) (sum % base); + carry = (uint32_t) (sum / base); + } + + while (length > 1 && values[length - 1] == 0) length--; + pm_integer_free(&z0); + pm_integer_free(&z1); + pm_integer_free(&z2); + pm_integer_free(&x01); + pm_integer_free(&y01); + pm_integer_free(&xy); + + *destination = (pm_integer_t) { 0, length, values, false }; +} + +/** + * The values of a hexadecimal digit, where the index is the ASCII character. + * Note that there's an odd exception here where _ is mapped to 0. This is + * because it's possible for us to end up trying to parse a number that has + * already had an error attached to it, and we want to provide _something_ to + * the user. + */ +static const int8_t pm_integer_parse_digit_values[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 3x + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, // 5x + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Fx +}; + +/** + * Return the value of a hexadecimal digit in a uint8_t. + */ +static uint8_t +pm_integer_parse_digit(const uint8_t character) { + int8_t value = pm_integer_parse_digit_values[character]; + assert(value != -1 && "invalid digit"); + + return (uint8_t) value; +} + +/** + * Create a pm_integer_t from uint64_t with the given base. It is assumed that + * the memory for the pm_integer_t pointer has been zeroed. + */ +static void +pm_integer_from_uint64(pm_integer_t *integer, uint64_t value, uint64_t base) { + if (value < base) { + integer->value = (uint32_t) value; + return; + } + + size_t length = 0; + uint64_t length_value = value; + while (length_value > 0) { + length++; + length_value /= base; + } + + uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * length); + if (values == NULL) return; + + for (size_t value_index = 0; value_index < length; value_index++) { + values[value_index] = (uint32_t) (value % base); + value /= base; + } + + integer->length = length; + integer->values = values; +} + +/** + * Normalize pm_integer_t. + * Heading zero values will be removed. If the integer fits into uint32_t, + * values is set to NULL, length is set to 0, and value field will be used. + */ +static void +pm_integer_normalize(pm_integer_t *integer) { + if (integer->values == NULL) { + return; + } + + while (integer->length > 1 && integer->values[integer->length - 1] == 0) { + integer->length--; + } + + if (integer->length > 1) { + return; + } + + uint32_t value = integer->values[0]; + bool negative = integer->negative && value != 0; + + pm_integer_free(integer); + *integer = (pm_integer_t) { .value = value, .length = 0, .values = NULL, .negative = negative }; +} + +/** + * Convert base of the integer. + * In practice, it converts 10**9 to 1<<32 or 1<<32 to 10**9. + */ +static void +pm_integer_convert_base(pm_integer_t *destination, const pm_integer_t *source, uint64_t base_from, uint64_t base_to) { + size_t source_length; + const uint32_t *source_values; + INTEGER_EXTRACT(source, source_length, source_values) + + size_t bigints_length = (source_length + 1) / 2; + assert(bigints_length > 0); + + pm_integer_t *bigints = (pm_integer_t *) xcalloc(bigints_length, sizeof(pm_integer_t)); + if (bigints == NULL) return; + + for (size_t index = 0; index < source_length; index += 2) { + uint64_t value = source_values[index] + base_from * (index + 1 < source_length ? source_values[index + 1] : 0); + pm_integer_from_uint64(&bigints[index / 2], value, base_to); + } + + pm_integer_t base = { 0 }; + pm_integer_from_uint64(&base, base_from, base_to); + + while (bigints_length > 1) { + pm_integer_t next_base; + karatsuba_multiply(&next_base, &base, &base, base_to); + + pm_integer_free(&base); + base = next_base; + + size_t next_length = (bigints_length + 1) / 2; + pm_integer_t *next_bigints = (pm_integer_t *) xcalloc(next_length, sizeof(pm_integer_t)); + + for (size_t bigints_index = 0; bigints_index < bigints_length; bigints_index += 2) { + if (bigints_index + 1 == bigints_length) { + next_bigints[bigints_index / 2] = bigints[bigints_index]; + } else { + pm_integer_t multiplied = { 0 }; + karatsuba_multiply(&multiplied, &base, &bigints[bigints_index + 1], base_to); + + big_add(&next_bigints[bigints_index / 2], &bigints[bigints_index], &multiplied, base_to); + pm_integer_free(&bigints[bigints_index]); + pm_integer_free(&bigints[bigints_index + 1]); + pm_integer_free(&multiplied); + } + } + + xfree(bigints); + bigints = next_bigints; + bigints_length = next_length; + } + + *destination = bigints[0]; + destination->negative = source->negative; + pm_integer_normalize(destination); + + xfree(bigints); + pm_integer_free(&base); +} + +#undef INTEGER_EXTRACT + +/** + * Convert digits to integer with the given power-of-two base. + */ +static void +pm_integer_parse_powof2(pm_integer_t *integer, uint32_t base, const uint8_t *digits, size_t digits_length) { + size_t bit = 1; + while (base > (uint32_t) (1 << bit)) bit++; + + size_t length = (digits_length * bit + 31) / 32; + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + + for (size_t digit_index = 0; digit_index < digits_length; digit_index++) { + size_t bit_position = bit * (digits_length - digit_index - 1); + uint32_t value = digits[digit_index]; + + size_t index = bit_position / 32; + size_t shift = bit_position % 32; + + values[index] |= value << shift; + if (32 - shift < bit) values[index + 1] |= value >> (32 - shift); + } + + while (length > 1 && values[length - 1] == 0) length--; + *integer = (pm_integer_t) { .value = 0, .length = length, .values = values, .negative = false }; + pm_integer_normalize(integer); +} + +/** + * Convert decimal digits to pm_integer_t. + */ +static void +pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t digits_length) { + const size_t batch = 9; + size_t length = (digits_length + batch - 1) / batch; + + uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); + uint32_t value = 0; + + for (size_t digits_index = 0; digits_index < digits_length; digits_index++) { + value = value * 10 + digits[digits_index]; + + size_t reverse_index = digits_length - digits_index - 1; + if (reverse_index % batch == 0) { + values[reverse_index / batch] = value; + value = 0; + } + } + + // Convert base from 10**9 to 1<<32. + pm_integer_convert_base(integer, &((pm_integer_t) { .value = 0, .length = length, .values = values, .negative = false }), 1000000000, ((uint64_t) 1 << 32)); + xfree(values); +} + +/** + * Parse a large integer from a string that does not fit into uint32_t. + */ +static void +pm_integer_parse_big(pm_integer_t *integer, uint32_t multiplier, const uint8_t *start, const uint8_t *end) { + // Allocate an array to store digits. + uint8_t *digits = xmalloc(sizeof(uint8_t) * (size_t) (end - start)); + size_t digits_length = 0; + + for (; start < end; start++) { + if (*start == '_') continue; + digits[digits_length++] = pm_integer_parse_digit(*start); + } + + // Construct pm_integer_t from the digits. + if (multiplier == 10) { + pm_integer_parse_decimal(integer, digits, digits_length); + } else { + pm_integer_parse_powof2(integer, multiplier, digits, digits_length); + } + + xfree(digits); +} + +/** + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + */ +void +pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end) { + // Ignore unary +. Unary - is parsed differently and will not end up here. + // Instead, it will modify the parsed integer later. + if (*start == '+') start++; + + // Determine the multiplier from the base, and skip past any prefixes. + uint32_t multiplier = 10; + switch (base) { + case PM_INTEGER_BASE_DEFAULT: + while (*start == '0') start++; // 01 -> 1 + break; + case PM_INTEGER_BASE_BINARY: + start += 2; // 0b + multiplier = 2; + break; + case PM_INTEGER_BASE_OCTAL: + start++; // 0 + if (*start == '_' || *start == 'o' || *start == 'O') start++; // o + multiplier = 8; + break; + case PM_INTEGER_BASE_DECIMAL: + if (*start == '0' && (end - start) > 1) start += 2; // 0d + break; + case PM_INTEGER_BASE_HEXADECIMAL: + start += 2; // 0x + multiplier = 16; + break; + case PM_INTEGER_BASE_UNKNOWN: + if (*start == '0' && (end - start) > 1) { + switch (start[1]) { + case '_': start += 2; multiplier = 8; break; + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': start++; multiplier = 8; break; + case 'b': case 'B': start += 2; multiplier = 2; break; + case 'o': case 'O': start += 2; multiplier = 8; break; + case 'd': case 'D': start += 2; break; + case 'x': case 'X': start += 2; multiplier = 16; break; + default: assert(false && "unreachable"); break; + } + } + break; + } + + // It's possible that we've consumed everything at this point if there is an + // invalid integer. If this is the case, we'll just return 0. + if (start >= end) return; + + const uint8_t *cursor = start; + uint64_t value = (uint64_t) pm_integer_parse_digit(*cursor++); + + for (; cursor < end; cursor++) { + if (*cursor == '_') continue; + value = value * multiplier + (uint64_t) pm_integer_parse_digit(*cursor); + + if (value > UINT32_MAX) { + // If the integer is too large to fit into a single uint32_t, then + // we'll parse it as a big integer. + pm_integer_parse_big(integer, multiplier, start, end); + return; + } + } + + integer->value = (uint32_t) value; +} + +/** + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. + */ +int +pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right) { + if (left->negative != right->negative) return left->negative ? -1 : 1; + int negative = left->negative ? -1 : 1; + + if (left->values == NULL && right->values == NULL) { + if (left->value < right->value) return -1 * negative; + if (left->value > right->value) return 1 * negative; + return 0; + } + + if (left->values == NULL || left->length < right->length) return -1 * negative; + if (right->values == NULL || left->length > right->length) return 1 * negative; + + for (size_t index = 0; index < left->length; index++) { + size_t value_index = left->length - index - 1; + uint32_t left_value = left->values[value_index]; + uint32_t right_value = right->values[value_index]; + + if (left_value < right_value) return -1 * negative; + if (left_value > right_value) return 1 * negative; + } + + return 0; +} + +/** + * Reduce a ratio of integers to its simplest form. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator) { + // If either the numerator or denominator do not fit into a 32-bit integer, + // then this function is a no-op. In the future, we may consider reducing + // even the larger numbers, but for now we're going to keep it simple. + if ( + // If the numerator doesn't fit into a 32-bit integer, return early. + numerator->length != 0 || + // If the denominator doesn't fit into a 32-bit integer, return early. + denominator->length != 0 || + // If the numerator is 0, then return early. + numerator->value == 0 || + // If the denominator is 1, then return early. + denominator->value == 1 + ) return; + + // Find the greatest common divisor of the numerator and denominator. + uint32_t divisor = numerator->value; + uint32_t remainder = denominator->value; + + while (remainder != 0) { + uint32_t temporary = remainder; + remainder = divisor % remainder; + divisor = temporary; + } + + // Divide the numerator and denominator by the greatest common divisor. + numerator->value /= divisor; + denominator->value /= divisor; +} + +/** + * Convert an integer to a decimal string. + */ +PRISM_EXPORTED_FUNCTION void +pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) { + if (integer->negative) { + pm_buffer_append_byte(buffer, '-'); + } + + // If the integer fits into a single uint32_t, then we can just append the + // value directly to the buffer. + if (integer->values == NULL) { + pm_buffer_append_format(buffer, "%" PRIu32, integer->value); + return; + } + + // If the integer is two uint32_t values, then we can | them together and + // append the result to the buffer. + if (integer->length == 2) { + const uint64_t value = ((uint64_t) integer->values[0]) | ((uint64_t) integer->values[1] << 32); + pm_buffer_append_format(buffer, "%" PRIu64, value); + return; + } + + // Otherwise, first we'll convert the base from 1<<32 to 10**9. + pm_integer_t converted = { 0 }; + pm_integer_convert_base(&converted, integer, (uint64_t) 1 << 32, 1000000000); + + if (converted.values == NULL) { + pm_buffer_append_format(buffer, "%" PRIu32, converted.value); + pm_integer_free(&converted); + return; + } + + // Allocate a buffer that we'll copy the decimal digits into. + size_t digits_length = converted.length * 9; + char *digits = xcalloc(digits_length, sizeof(char)); + if (digits == NULL) return; + + // Pack bigdecimal to digits. + for (size_t value_index = 0; value_index < converted.length; value_index++) { + uint32_t value = converted.values[value_index]; + + for (size_t digit_index = 0; digit_index < 9; digit_index++) { + digits[digits_length - 9 * value_index - digit_index - 1] = (char) ('0' + value % 10); + value /= 10; + } + } + + size_t start_offset = 0; + while (start_offset < digits_length - 1 && digits[start_offset] == '0') start_offset++; + + // Finally, append the string to the buffer and free the digits. + pm_buffer_append_string(buffer, digits + start_offset, digits_length - start_offset); + xfree(digits); + pm_integer_free(&converted); +} + +/** + * Free the internal memory of an integer. This memory will only be allocated if + * the integer exceeds the size of a single uint32_t. + */ +PRISM_EXPORTED_FUNCTION void +pm_integer_free(pm_integer_t *integer) { + if (integer->values) { + xfree(integer->values); + } +} diff --git a/prism/util/pm_integer.h b/prism/util/pm_integer.h new file mode 100644 index 0000000000..91b28ad2f3 --- /dev/null +++ b/prism/util/pm_integer.h @@ -0,0 +1,126 @@ +/** + * @file pm_integer.h + * + * This module provides functions for working with arbitrary-sized integers. + */ +#ifndef PRISM_NUMBER_H +#define PRISM_NUMBER_H + +#include "prism/defines.h" +#include "prism/util/pm_buffer.h" + +#include <assert.h> +#include <stdbool.h> +#include <stdint.h> +#include <stdlib.h> + +/** + * A structure represents an arbitrary-sized integer. + */ +typedef struct { + /** + * Embedded value for small integer. This value is set to 0 if the value + * does not fit into uint32_t. + */ + uint32_t value; + + /** + * The number of allocated values. length is set to 0 if the integer fits + * into uint32_t. + */ + size_t length; + + /** + * List of 32-bit integers. Set to NULL if the integer fits into uint32_t. + */ + uint32_t *values; + + /** + * Whether or not the integer is negative. It is stored this way so that a + * zeroed pm_integer_t is always positive zero. + */ + bool negative; +} pm_integer_t; + +/** + * An enum controlling the base of an integer. It is expected that the base is + * already known before parsing the integer, even though it could be derived + * from the string itself. + */ +typedef enum { + /** The default decimal base, with no prefix. Leading 0s will be ignored. */ + PM_INTEGER_BASE_DEFAULT, + + /** The binary base, indicated by a 0b or 0B prefix. */ + PM_INTEGER_BASE_BINARY, + + /** The octal base, indicated by a 0, 0o, or 0O prefix. */ + PM_INTEGER_BASE_OCTAL, + + /** The decimal base, indicated by a 0d, 0D, or empty prefix. */ + PM_INTEGER_BASE_DECIMAL, + + /** The hexadecimal base, indicated by a 0x or 0X prefix. */ + PM_INTEGER_BASE_HEXADECIMAL, + + /** + * An unknown base, in which case pm_integer_parse will derive it based on + * the content of the string. This is less efficient and does more + * comparisons, so if callers know the base ahead of time, they should use + * that instead. + */ + PM_INTEGER_BASE_UNKNOWN +} pm_integer_base_t; + +/** + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + * + * @param integer The integer to parse into. + * @param base The base of the integer. + * @param start The start of the string. + * @param end The end of the string. + */ +void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end); + +/** + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. + * + * @param left The left integer to compare. + * @param right The right integer to compare. + * @return The result of the comparison. + */ +int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right); + +/** + * Reduce a ratio of integers to its simplest form. + * + * If either the numerator or denominator do not fit into a 32-bit integer, then + * this function is a no-op. In the future, we may consider reducing even the + * larger numbers, but for now we're going to keep it simple. + * + * @param numerator The numerator of the ratio. + * @param denominator The denominator of the ratio. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator); + +/** + * Convert an integer to a decimal string. + * + * @param buffer The buffer to append the string to. + * @param integer The integer to convert to a string. + */ +PRISM_EXPORTED_FUNCTION void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer); + +/** + * Free the internal memory of an integer. This memory will only be allocated if + * the integer exceeds the size of a single node in the linked list. + * + * @param integer The integer to free. + */ +PRISM_EXPORTED_FUNCTION void pm_integer_free(pm_integer_t *integer); + +#endif diff --git a/prism/util/pm_list.c b/prism/util/pm_list.c new file mode 100644 index 0000000000..ad2294cd60 --- /dev/null +++ b/prism/util/pm_list.c @@ -0,0 +1,49 @@ +#include "prism/util/pm_list.h" + +/** + * Returns true if the given list is empty. + */ +PRISM_EXPORTED_FUNCTION bool +pm_list_empty_p(pm_list_t *list) { + return list->head == NULL; +} + +/** + * Returns the size of the list. + */ +PRISM_EXPORTED_FUNCTION size_t +pm_list_size(pm_list_t *list) { + return list->size; +} + +/** + * Append a node to the given list. + */ +void +pm_list_append(pm_list_t *list, pm_list_node_t *node) { + if (list->head == NULL) { + list->head = node; + } else { + list->tail->next = node; + } + + list->tail = node; + list->size++; +} + +/** + * Deallocate the internal state of the given list. + */ +PRISM_EXPORTED_FUNCTION void +pm_list_free(pm_list_t *list) { + pm_list_node_t *node = list->head; + pm_list_node_t *next; + + while (node != NULL) { + next = node->next; + xfree(node); + node = next; + } + + list->size = 0; +} diff --git a/prism/util/pm_list.h b/prism/util/pm_list.h new file mode 100644 index 0000000000..3512dee979 --- /dev/null +++ b/prism/util/pm_list.h @@ -0,0 +1,97 @@ +/** + * @file pm_list.h + * + * An abstract linked list. + */ +#ifndef PRISM_LIST_H +#define PRISM_LIST_H + +#include "prism/defines.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdlib.h> + +/** + * This struct represents an abstract linked list that provides common + * functionality. It is meant to be used any time a linked list is necessary to + * store data. + * + * The linked list itself operates off a set of pointers. Because the pointers + * are not necessarily sequential, they can be of any size. We use this fact to + * allow the consumer of this linked list to extend the node struct to include + * any data they want. This is done by using the pm_list_node_t as the first + * member of the struct. + * + * For example, if we want to store a list of integers, we can do the following: + * + * ```c + * typedef struct { + * pm_list_node_t node; + * int value; + * } pm_int_node_t; + * + * pm_list_t list = { 0 }; + * pm_int_node_t *node = xmalloc(sizeof(pm_int_node_t)); + * node->value = 5; + * + * pm_list_append(&list, &node->node); + * ``` + * + * The pm_list_t struct is used to represent the overall linked list. It + * contains a pointer to the head and tail of the list. This allows for easy + * iteration and appending of new nodes. + */ +typedef struct pm_list_node { + /** A pointer to the next node in the list. */ + struct pm_list_node *next; +} pm_list_node_t; + +/** + * This represents the overall linked list. It keeps a pointer to the head and + * tail so that iteration is easy and pushing new nodes is easy. + */ +typedef struct { + /** The size of the list. */ + size_t size; + + /** A pointer to the head of the list. */ + pm_list_node_t *head; + + /** A pointer to the tail of the list. */ + pm_list_node_t *tail; +} pm_list_t; + +/** + * Returns true if the given list is empty. + * + * @param list The list to check. + * @return True if the given list is empty, otherwise false. + */ +PRISM_EXPORTED_FUNCTION bool pm_list_empty_p(pm_list_t *list); + +/** + * Returns the size of the list. + * + * @param list The list to check. + * @return The size of the list. + */ +PRISM_EXPORTED_FUNCTION size_t pm_list_size(pm_list_t *list); + +/** + * Append a node to the given list. + * + * @param list The list to append to. + * @param node The node to append. + */ +void pm_list_append(pm_list_t *list, pm_list_node_t *node); + +/** + * Deallocate the internal state of the given list. + * + * @param list The list to free. + */ +PRISM_EXPORTED_FUNCTION void pm_list_free(pm_list_t *list); + +#endif diff --git a/prism/util/pm_memchr.c b/prism/util/pm_memchr.c new file mode 100644 index 0000000000..7ea20ace6d --- /dev/null +++ b/prism/util/pm_memchr.c @@ -0,0 +1,35 @@ +#include "prism/util/pm_memchr.h" + +#define PRISM_MEMCHR_TRAILING_BYTE_MINIMUM 0x40 + +/** + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. + */ +void * +pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) { + if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) { + const uint8_t *source = (const uint8_t *) memory; + size_t index = 0; + + while (index < number) { + if (source[index] == character) { + return (void *) (source + index); + } + + size_t width = encoding->char_width(source + index, (ptrdiff_t) (number - index)); + if (width == 0) { + return NULL; + } + + index += width; + } + + return NULL; + } else { + return memchr(memory, character, number); + } +} + +#undef PRISM_MEMCHR_TRAILING_BYTE_MINIMUM diff --git a/prism/util/pm_memchr.h b/prism/util/pm_memchr.h new file mode 100644 index 0000000000..e0671eaed3 --- /dev/null +++ b/prism/util/pm_memchr.h @@ -0,0 +1,29 @@ +/** + * @file pm_memchr.h + * + * A custom memchr implementation. + */ +#ifndef PRISM_MEMCHR_H +#define PRISM_MEMCHR_H + +#include "prism/defines.h" +#include "prism/encoding.h" + +#include <stddef.h> + +/** + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. + * + * @param source The source string. + * @param character The character to search for. + * @param number The maximum number of bytes to search. + * @param encoding_changed Whether the encoding changed. + * @param encoding A pointer to the encoding. + * @return A pointer to the first occurrence of the character in the source + * string, or NULL if no such character exists. + */ +void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding); + +#endif diff --git a/prism/util/pm_newline_list.c b/prism/util/pm_newline_list.c new file mode 100644 index 0000000000..ce07ce8c8e --- /dev/null +++ b/prism/util/pm_newline_list.c @@ -0,0 +1,96 @@ +#include "prism/util/pm_newline_list.h" + +/** + * Initialize a new newline list with the given capacity. Returns true if the + * allocation of the offsets succeeds, otherwise returns false. + */ +bool +pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) { + list->offsets = (size_t *) xcalloc(capacity, sizeof(size_t)); + if (list->offsets == NULL) return false; + + list->start = start; + + // This is 1 instead of 0 because we want to include the first line of the + // file as having offset 0, which is set because of calloc. + list->size = 1; + list->capacity = capacity; + + return true; +} + +/** + * Clear out the newlines that have been appended to the list. + */ +void +pm_newline_list_clear(pm_newline_list_t *list) { + list->size = 1; +} + +/** + * Append a new offset to the newline list. Returns true if the reallocation of + * the offsets succeeds (if one was necessary), otherwise returns false. + */ +bool +pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) { + if (list->size == list->capacity) { + size_t *original_offsets = list->offsets; + + list->capacity = (list->capacity * 3) / 2; + list->offsets = (size_t *) xcalloc(list->capacity, sizeof(size_t)); + if (list->offsets == NULL) return false; + + memcpy(list->offsets, original_offsets, list->size * sizeof(size_t)); + xfree(original_offsets); + } + + assert(*cursor == '\n'); + assert(cursor >= list->start); + size_t newline_offset = (size_t) (cursor - list->start + 1); + + assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]); + list->offsets[list->size++] = newline_offset; + + return true; +} + +/** + * Returns the line and column of the given offset. If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. + */ +pm_line_column_t +pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) { + assert(cursor >= list->start); + size_t offset = (size_t) (cursor - list->start); + + size_t left = 0; + size_t right = list->size - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + + if (list->offsets[mid] == offset) { + return ((pm_line_column_t) { ((int32_t) mid) + start_line, 0 }); + } + + if (list->offsets[mid] < offset) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + return ((pm_line_column_t) { + .line = ((int32_t) left) + start_line - 1, + .column = (uint32_t) (offset - list->offsets[left - 1]) + }); +} + +/** + * Free the internal memory allocated for the newline list. + */ +void +pm_newline_list_free(pm_newline_list_t *list) { + xfree(list->offsets); +} diff --git a/prism/util/pm_newline_list.h b/prism/util/pm_newline_list.h new file mode 100644 index 0000000000..7ae9b6b3da --- /dev/null +++ b/prism/util/pm_newline_list.h @@ -0,0 +1,102 @@ +/** + * @file pm_newline_list.h + * + * A list of byte offsets of newlines in a string. + * + * When compiling the syntax tree, it's necessary to know the line and column + * of many nodes. This is necessary to support things like error messages, + * tracepoints, etc. + * + * It's possible that we could store the start line, start column, end line, and + * end column on every node in addition to the offsets that we already store, + * but that would be quite a lot of memory overhead. + */ +#ifndef PRISM_NEWLINE_LIST_H +#define PRISM_NEWLINE_LIST_H + +#include "prism/defines.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> + +/** + * A list of offsets of newlines in a string. The offsets are assumed to be + * sorted/inserted in ascending order. + */ +typedef struct { + /** A pointer to the start of the source string. */ + const uint8_t *start; + + /** The number of offsets in the list. */ + size_t size; + + /** The capacity of the list that has been allocated. */ + size_t capacity; + + /** The list of offsets. */ + size_t *offsets; +} pm_newline_list_t; + +/** + * A line and column in a string. + */ +typedef struct { + /** The line number. */ + int32_t line; + + /** The column number. */ + uint32_t column; +} pm_line_column_t; + +/** + * Initialize a new newline list with the given capacity. Returns true if the + * allocation of the offsets succeeds, otherwise returns false. + * + * @param list The list to initialize. + * @param start A pointer to the start of the source string. + * @param capacity The initial capacity of the list. + * @return True if the allocation of the offsets succeeds, otherwise false. + */ +bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity); + +/** + * Clear out the newlines that have been appended to the list. + * + * @param list The list to clear. + */ +void +pm_newline_list_clear(pm_newline_list_t *list); + +/** + * Append a new offset to the newline list. Returns true if the reallocation of + * the offsets succeeds (if one was necessary), otherwise returns false. + * + * @param list The list to append to. + * @param cursor A pointer to the offset to append. + * @return True if the reallocation of the offsets succeeds (if one was + * necessary), otherwise false. + */ +bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor); + +/** + * Returns the line and column of the given offset. If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. + * + * @param list The list to search. + * @param cursor A pointer to the offset to search for. + * @param start_line The line to start counting from. + * @return The line and column of the given offset. + */ +pm_line_column_t pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line); + +/** + * Free the internal memory allocated for the newline list. + * + * @param list The list to free. + */ +void pm_newline_list_free(pm_newline_list_t *list); + +#endif diff --git a/prism/util/pm_string.c b/prism/util/pm_string.c new file mode 100644 index 0000000000..dfc121b6a2 --- /dev/null +++ b/prism/util/pm_string.c @@ -0,0 +1,320 @@ +#include "prism/util/pm_string.h" + +/** + * Returns the size of the pm_string_t struct. This is necessary to allocate the + * correct amount of memory in the FFI backend. + */ +PRISM_EXPORTED_FUNCTION size_t +pm_string_sizeof(void) { + return sizeof(pm_string_t); +} + +/** + * Initialize a shared string that is based on initial input. + */ +void +pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end) { + assert(start <= end); + + *string = (pm_string_t) { + .type = PM_STRING_SHARED, + .source = start, + .length = (size_t) (end - start) + }; +} + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + */ +void +pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_OWNED, + .source = source, + .length = length + }; +} + +/** + * Initialize a constant string that doesn't own its memory source. + */ +void +pm_string_constant_init(pm_string_t *string, const char *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_CONSTANT, + .source = (const uint8_t *) source, + .length = length + }; +} + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * We want to use demand paging as much as possible in order to avoid having to + * read the entire file into memory (which could be detrimental to performance + * for large files). This means that if we're on windows we'll use + * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use + * `mmap`, and on other POSIX systems we'll use `read`. + */ +PRISM_EXPORTED_FUNCTION bool +pm_string_mapped_init(pm_string_t *string, const char *filepath) { +#ifdef _WIN32 + // Open the file for reading. + HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + + if (file == INVALID_HANDLE_VALUE) { + return false; + } + + // Get the file size. + DWORD file_size = GetFileSize(file, NULL); + if (file_size == INVALID_FILE_SIZE) { + CloseHandle(file); + return false; + } + + // If the file is empty, then we don't need to do anything else, we'll set + // the source to a constant empty string and return. + if (file_size == 0) { + CloseHandle(file); + const uint8_t source[] = ""; + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; + return true; + } + + // Create a mapping of the file. + HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL); + if (mapping == NULL) { + CloseHandle(file); + return false; + } + + // Map the file into memory. + uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(mapping); + CloseHandle(file); + + if (source == NULL) { + return false; + } + + *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = (size_t) file_size }; + return true; +#elif defined(_POSIX_MAPPED_FILES) + // Open the file for reading + int fd = open(filepath, O_RDONLY); + if (fd == -1) { + return false; + } + + // Stat the file to get the file size + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + return false; + } + + // mmap the file descriptor to virtually get the contents + size_t size = (size_t) sb.st_size; + uint8_t *source = NULL; + + if (size == 0) { + close(fd); + const uint8_t source[] = ""; + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; + return true; + } + + source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (source == MAP_FAILED) { + return false; + } + + close(fd); + *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = size }; + return true; +#else + (void) string; + (void) filepath; + perror("pm_string_mapped_init is not implemented for this platform"); + return false; +#endif +} + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + */ +PRISM_EXPORTED_FUNCTION bool +pm_string_file_init(pm_string_t *string, const char *filepath) { +#ifdef _WIN32 + // Open the file for reading. + HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); + + if (file == INVALID_HANDLE_VALUE) { + return false; + } + + // Get the file size. + DWORD file_size = GetFileSize(file, NULL); + if (file_size == INVALID_FILE_SIZE) { + CloseHandle(file); + return false; + } + + // If the file is empty, then we don't need to do anything else, we'll set + // the source to a constant empty string and return. + if (file_size == 0) { + CloseHandle(file); + const uint8_t source[] = ""; + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; + return true; + } + + // Create a buffer to read the file into. + uint8_t *source = xmalloc(file_size); + if (source == NULL) { + CloseHandle(file); + return false; + } + + // Read the contents of the file + DWORD bytes_read; + if (!ReadFile(file, source, file_size, &bytes_read, NULL)) { + CloseHandle(file); + return false; + } + + // Check the number of bytes read + if (bytes_read != file_size) { + xfree(source); + CloseHandle(file); + return false; + } + + CloseHandle(file); + *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size }; + return true; +#elif defined(_POSIX_MAPPED_FILES) + FILE *file = fopen(filepath, "rb"); + if (file == NULL) { + return false; + } + + fseek(file, 0, SEEK_END); + long file_size = ftell(file); + + if (file_size == -1) { + fclose(file); + return false; + } + + if (file_size == 0) { + fclose(file); + const uint8_t source[] = ""; + *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; + return true; + } + + size_t length = (size_t) file_size; + uint8_t *source = xmalloc(length); + if (source == NULL) { + fclose(file); + return false; + } + + fseek(file, 0, SEEK_SET); + size_t bytes_read = fread(source, length, 1, file); + fclose(file); + + if (bytes_read != 1) { + xfree(source); + return false; + } + + *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = length }; + return true; +#else + (void) string; + (void) filepath; + perror("pm_string_file_init is not implemented for this platform"); + return false; +#endif +} + +/** + * Ensure the string is owned. If it is not, then reinitialize it as owned and + * copy over the previous source. + */ +void +pm_string_ensure_owned(pm_string_t *string) { + if (string->type == PM_STRING_OWNED) return; + + size_t length = pm_string_length(string); + const uint8_t *source = pm_string_source(string); + + uint8_t *memory = xmalloc(length); + if (!memory) return; + + pm_string_owned_init(string, memory, length); + memcpy((void *) string->source, source, length); +} + +/** + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + */ +int +pm_string_compare(const pm_string_t *left, const pm_string_t *right) { + size_t left_length = pm_string_length(left); + size_t right_length = pm_string_length(right); + + if (left_length < right_length) { + return -1; + } else if (left_length > right_length) { + return 1; + } + + return memcmp(pm_string_source(left), pm_string_source(right), left_length); +} + +/** + * Returns the length associated with the string. + */ +PRISM_EXPORTED_FUNCTION size_t +pm_string_length(const pm_string_t *string) { + return string->length; +} + +/** + * Returns the start pointer associated with the string. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * +pm_string_source(const pm_string_t *string) { + return string->source; +} + +/** + * Free the associated memory of the given string. + */ +PRISM_EXPORTED_FUNCTION void +pm_string_free(pm_string_t *string) { + void *memory = (void *) string->source; + + if (string->type == PM_STRING_OWNED) { + xfree(memory); +#ifdef PRISM_HAS_MMAP + } else if (string->type == PM_STRING_MAPPED && string->length) { +#if defined(_WIN32) + UnmapViewOfFile(memory); +#elif defined(_POSIX_MAPPED_FILES) + munmap(memory, string->length); +#endif +#endif /* PRISM_HAS_MMAP */ + } +} diff --git a/prism/util/pm_string.h b/prism/util/pm_string.h new file mode 100644 index 0000000000..d23792c0ba --- /dev/null +++ b/prism/util/pm_string.h @@ -0,0 +1,166 @@ +/** + * @file pm_string.h + * + * A generic string type that can have various ownership semantics. + */ +#ifndef PRISM_STRING_H +#define PRISM_STRING_H + +#include "prism/defines.h" + +#include <assert.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +// The following headers are necessary to read files using demand paging. +#ifdef _WIN32 +#include <windows.h> +#elif defined(_POSIX_MAPPED_FILES) +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#endif + +/** + * A generic string type that can have various ownership semantics. + */ +typedef struct { + /** A pointer to the start of the string. */ + const uint8_t *source; + + /** The length of the string in bytes of memory. */ + size_t length; + + /** The type of the string. This field determines how the string should be freed. */ + enum { + /** This string is a constant string, and should not be freed. */ + PM_STRING_CONSTANT, + + /** This is a slice of another string, and should not be freed. */ + PM_STRING_SHARED, + + /** This string owns its memory, and should be freed using `pm_string_free`. */ + PM_STRING_OWNED, + +#ifdef PRISM_HAS_MMAP + /** This string is a memory-mapped file, and should be freed using `pm_string_free`. */ + PM_STRING_MAPPED +#endif + } type; +} pm_string_t; + +/** + * Returns the size of the pm_string_t struct. This is necessary to allocate the + * correct amount of memory in the FFI backend. + * + * @return The size of the pm_string_t struct. + */ +PRISM_EXPORTED_FUNCTION size_t pm_string_sizeof(void); + +/** + * Defines an empty string. This is useful for initializing a string that will + * be filled in later. + */ +#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 }) + +/** + * Initialize a shared string that is based on initial input. + * + * @param string The string to initialize. + * @param start The start of the string. + * @param end The end of the string. + */ +void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end); + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length); + +/** + * Initialize a constant string that doesn't own its memory source. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +void pm_string_constant_init(pm_string_t *string, const char *source, size_t length); + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * We want to use demand paging as much as possible in order to avoid having to + * read the entire file into memory (which could be detrimental to performance + * for large files). This means that if we're on windows we'll use + * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use + * `mmap`, and on other POSIX systems we'll use `read`. + * + * @param string The string to initialize. + * @param filepath The filepath to read. + * @return Whether or not the file was successfully mapped. + */ +PRISM_EXPORTED_FUNCTION bool pm_string_mapped_init(pm_string_t *string, const char *filepath); + +/** + * Read the file indicated by the filepath parameter into source and load its + * contents and size into the given `pm_string_t`. The given `pm_string_t` + * should be freed using `pm_string_free` when it is no longer used. + * + * @param string The string to initialize. + * @param filepath The filepath to read. + * @return Whether or not the file was successfully read. + */ +PRISM_EXPORTED_FUNCTION bool pm_string_file_init(pm_string_t *string, const char *filepath); + +/** + * Ensure the string is owned. If it is not, then reinitialize it as owned and + * copy over the previous source. + * + * @param string The string to ensure is owned. + */ +void pm_string_ensure_owned(pm_string_t *string); + +/** + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + * + * @param left The left string to compare. + * @param right The right string to compare. + * @return The comparison result. + */ +int pm_string_compare(const pm_string_t *left, const pm_string_t *right); + +/** + * Returns the length associated with the string. + * + * @param string The string to get the length of. + * @return The length of the string. + */ +PRISM_EXPORTED_FUNCTION size_t pm_string_length(const pm_string_t *string); + +/** + * Returns the start pointer associated with the string. + * + * @param string The string to get the start pointer of. + * @return The start pointer of the string. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_string_source(const pm_string_t *string); + +/** + * Free the associated memory of the given string. + * + * @param string The string to free. + */ +PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string); + +#endif diff --git a/prism/util/pm_strncasecmp.c b/prism/util/pm_strncasecmp.c new file mode 100644 index 0000000000..2240bf8110 --- /dev/null +++ b/prism/util/pm_strncasecmp.c @@ -0,0 +1,24 @@ +#include "prism/util/pm_strncasecmp.h" + +/** + * Compare two strings, ignoring case, up to the given length. Returns 0 if the + * strings are equal, a negative number if string1 is less than string2, or a + * positive number if string1 is greater than string2. + * + * Note that this is effectively our own implementation of strncasecmp, but it's + * not available on all of the platforms we want to support so we're rolling it + * here. + */ +int +pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length) { + size_t offset = 0; + int difference = 0; + + while (offset < length && string1[offset] != '\0') { + if (string2[offset] == '\0') return string1[offset]; + if ((difference = tolower(string1[offset]) - tolower(string2[offset])) != 0) return difference; + offset++; + } + + return difference; +} diff --git a/prism/util/pm_strncasecmp.h b/prism/util/pm_strncasecmp.h new file mode 100644 index 0000000000..5cb88cb5eb --- /dev/null +++ b/prism/util/pm_strncasecmp.h @@ -0,0 +1,32 @@ +/** + * @file pm_strncasecmp.h + * + * A custom strncasecmp implementation. + */ +#ifndef PRISM_STRNCASECMP_H +#define PRISM_STRNCASECMP_H + +#include "prism/defines.h" + +#include <ctype.h> +#include <stddef.h> +#include <stdint.h> + +/** + * Compare two strings, ignoring case, up to the given length. Returns 0 if the + * strings are equal, a negative number if string1 is less than string2, or a + * positive number if string1 is greater than string2. + * + * Note that this is effectively our own implementation of strncasecmp, but it's + * not available on all of the platforms we want to support so we're rolling it + * here. + * + * @param string1 The first string to compare. + * @param string2 The second string to compare + * @param length The maximum number of characters to compare. + * @return 0 if the strings are equal, a negative number if string1 is less than + * string2, or a positive number if string1 is greater than string2. + */ +int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length); + +#endif diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c new file mode 100644 index 0000000000..916a4cc3fd --- /dev/null +++ b/prism/util/pm_strpbrk.c @@ -0,0 +1,206 @@ +#include "prism/util/pm_strpbrk.h" + +/** + * Add an invalid multibyte character error to the parser. + */ +static inline void +pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start); +} + +/** + * Set the explicit encoding for the parser to the current encoding. + */ +static inline void +pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) { + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == parser->encoding) { + // Okay, we already locked to this encoding. + } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // Not okay, we already found a Unicode escape sequence and this + // conflicts. + pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } else { + // Should not be anything else. + assert(false && "unreachable"); + } + } + + parser->explicit_encoding = parser->encoding; +} + +/** + * This is the default path. + */ +static inline const uint8_t * +pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * This is the path when the encoding is ASCII-8BIT. + */ +static inline const uint8_t * +pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1); + index++; + } + + return NULL; +} + +/** + * This is the slow path that does care about the encoding. + */ +static inline const uint8_t * +pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * This is the fast path that does not care about the encoding because we know + * the encoding only supports single-byte characters. + */ +static inline const uint8_t * +pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { + size_t index = 0; + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80 || !validate) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + pm_strpbrk_explicit_encoding_set(parser, source, width); + + if (width > 0) { + index += width; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); + } + } + } + + return NULL; +} + +/** + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. + * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift_JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. + */ +const uint8_t * +pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) { + if (length <= 0) { + return NULL; + } else if (!parser->encoding_changed) { + return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate); + } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate); + } else if (parser->encoding->multibyte) { + return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate); + } else { + return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate); + } +} diff --git a/prism/util/pm_strpbrk.h b/prism/util/pm_strpbrk.h new file mode 100644 index 0000000000..f387bd5782 --- /dev/null +++ b/prism/util/pm_strpbrk.h @@ -0,0 +1,46 @@ +/** + * @file pm_strpbrk.h + * + * A custom strpbrk implementation. + */ +#ifndef PRISM_STRPBRK_H +#define PRISM_STRPBRK_H + +#include "prism/defines.h" +#include "prism/diagnostic.h" +#include "prism/parser.h" + +#include <stddef.h> +#include <string.h> + +/** + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. + * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift-JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. + * + * @param parser The parser. + * @param source The source to search. + * @param charset The charset to search for. + * @param length The maximum number of bytes to search. + * @param validate Whether to validate that the source string is valid in the + * current encoding of the parser. + * @return A pointer to the first character in the source string that is in the + * charset, or NULL if no such character exists. + */ +const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate); + +#endif |