summaryrefslogtreecommitdiff
path: root/prism/util
diff options
context:
space:
mode:
Diffstat (limited to 'prism/util')
-rw-r--r--prism/util/pm_buffer.c317
-rw-r--r--prism/util/pm_buffer.h218
-rw-r--r--prism/util/pm_char.c318
-rw-r--r--prism/util/pm_char.h204
-rw-r--r--prism/util/pm_constant_pool.c338
-rw-r--r--prism/util/pm_constant_pool.h218
-rw-r--r--prism/util/pm_integer.c670
-rw-r--r--prism/util/pm_integer.h126
-rw-r--r--prism/util/pm_list.c49
-rw-r--r--prism/util/pm_list.h97
-rw-r--r--prism/util/pm_memchr.c35
-rw-r--r--prism/util/pm_memchr.h29
-rw-r--r--prism/util/pm_newline_list.c96
-rw-r--r--prism/util/pm_newline_list.h102
-rw-r--r--prism/util/pm_string.c320
-rw-r--r--prism/util/pm_string.h166
-rw-r--r--prism/util/pm_strncasecmp.c24
-rw-r--r--prism/util/pm_strncasecmp.h32
-rw-r--r--prism/util/pm_strpbrk.c206
-rw-r--r--prism/util/pm_strpbrk.h46
20 files changed, 3611 insertions, 0 deletions
diff --git a/prism/util/pm_buffer.c b/prism/util/pm_buffer.c
new file mode 100644
index 0000000000..1e79e46718
--- /dev/null
+++ b/prism/util/pm_buffer.c
@@ -0,0 +1,317 @@
+#include "prism/util/pm_buffer.h"
+
+/**
+ * Return the size of the pm_buffer_t struct.
+ */
+size_t
+pm_buffer_sizeof(void) {
+ return sizeof(pm_buffer_t);
+}
+
+/**
+ * Initialize a pm_buffer_t with the given capacity.
+ */
+bool
+pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity) {
+ buffer->length = 0;
+ buffer->capacity = capacity;
+
+ buffer->value = (char *) xmalloc(capacity);
+ return buffer->value != NULL;
+}
+
+/**
+ * Initialize a pm_buffer_t with its default values.
+ */
+bool
+pm_buffer_init(pm_buffer_t *buffer) {
+ return pm_buffer_init_capacity(buffer, 1024);
+}
+
+/**
+ * Return the value of the buffer.
+ */
+char *
+pm_buffer_value(const pm_buffer_t *buffer) {
+ return buffer->value;
+}
+
+/**
+ * Return the length of the buffer.
+ */
+size_t
+pm_buffer_length(const pm_buffer_t *buffer) {
+ return buffer->length;
+}
+
+/**
+ * Append the given amount of space to the buffer.
+ */
+static inline bool
+pm_buffer_append_length(pm_buffer_t *buffer, size_t length) {
+ size_t next_length = buffer->length + length;
+
+ if (next_length > buffer->capacity) {
+ if (buffer->capacity == 0) {
+ buffer->capacity = 1;
+ }
+
+ while (next_length > buffer->capacity) {
+ buffer->capacity *= 2;
+ }
+
+ buffer->value = xrealloc(buffer->value, buffer->capacity);
+ if (buffer->value == NULL) return false;
+ }
+
+ buffer->length = next_length;
+ return true;
+}
+
+/**
+ * Append a generic pointer to memory to the buffer.
+ */
+static inline void
+pm_buffer_append(pm_buffer_t *buffer, const void *source, size_t length) {
+ size_t cursor = buffer->length;
+ if (pm_buffer_append_length(buffer, length)) {
+ memcpy(buffer->value + cursor, source, length);
+ }
+}
+
+/**
+ * Append the given amount of space as zeroes to the buffer.
+ */
+void
+pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length) {
+ size_t cursor = buffer->length;
+ if (pm_buffer_append_length(buffer, length)) {
+ memset(buffer->value + cursor, 0, length);
+ }
+}
+
+/**
+ * Append a formatted string to the buffer.
+ */
+void
+pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) {
+ va_list arguments;
+ va_start(arguments, format);
+ int result = vsnprintf(NULL, 0, format, arguments);
+ va_end(arguments);
+
+ if (result < 0) return;
+ size_t length = (size_t) (result + 1);
+
+ size_t cursor = buffer->length;
+ if (pm_buffer_append_length(buffer, length)) {
+ va_start(arguments, format);
+ vsnprintf(buffer->value + cursor, length, format, arguments);
+ va_end(arguments);
+ buffer->length--;
+ }
+}
+
+/**
+ * Append a string to the buffer.
+ */
+void
+pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length) {
+ pm_buffer_append(buffer, value, length);
+}
+
+/**
+ * Append a list of bytes to the buffer.
+ */
+void
+pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length) {
+ pm_buffer_append(buffer, (const char *) value, length);
+}
+
+/**
+ * Append a single byte to the buffer.
+ */
+void
+pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value) {
+ const void *source = &value;
+ pm_buffer_append(buffer, source, sizeof(uint8_t));
+}
+
+/**
+ * Append a 32-bit unsigned integer to the buffer as a variable-length integer.
+ */
+void
+pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value) {
+ if (value < 128) {
+ pm_buffer_append_byte(buffer, (uint8_t) value);
+ } else {
+ uint32_t n = value;
+ while (n >= 128) {
+ pm_buffer_append_byte(buffer, (uint8_t) (n | 128));
+ n >>= 7;
+ }
+ pm_buffer_append_byte(buffer, (uint8_t) n);
+ }
+}
+
+/**
+ * Append a 32-bit signed integer to the buffer as a variable-length integer.
+ */
+void
+pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value) {
+ uint32_t unsigned_int = ((uint32_t)(value) << 1) ^ ((uint32_t)(value >> 31));
+ pm_buffer_append_varuint(buffer, unsigned_int);
+}
+
+/**
+ * Append a double to the buffer.
+ */
+void
+pm_buffer_append_double(pm_buffer_t *buffer, double value) {
+ const void *source = &value;
+ pm_buffer_append(buffer, source, sizeof(double));
+}
+
+/**
+ * Append a slice of source code to the buffer.
+ */
+void
+pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping) {
+ for (size_t index = 0; index < length; index++) {
+ const uint8_t byte = source[index];
+
+ if ((byte <= 0x06) || (byte >= 0x0E && byte <= 0x1F) || (byte >= 0x7F)) {
+ if (escaping == PM_BUFFER_ESCAPING_RUBY) {
+ pm_buffer_append_format(buffer, "\\x%02X", byte);
+ } else {
+ pm_buffer_append_format(buffer, "\\u%04X", byte);
+ }
+ } else {
+ switch (byte) {
+ case '\a':
+ if (escaping == PM_BUFFER_ESCAPING_RUBY) {
+ pm_buffer_append_string(buffer, "\\a", 2);
+ } else {
+ pm_buffer_append_format(buffer, "\\u%04X", byte);
+ }
+ break;
+ case '\b':
+ pm_buffer_append_string(buffer, "\\b", 2);
+ break;
+ case '\t':
+ pm_buffer_append_string(buffer, "\\t", 2);
+ break;
+ case '\n':
+ pm_buffer_append_string(buffer, "\\n", 2);
+ break;
+ case '\v':
+ if (escaping == PM_BUFFER_ESCAPING_RUBY) {
+ pm_buffer_append_string(buffer, "\\v", 2);
+ } else {
+ pm_buffer_append_format(buffer, "\\u%04X", byte);
+ }
+ break;
+ case '\f':
+ pm_buffer_append_string(buffer, "\\f", 2);
+ break;
+ case '\r':
+ pm_buffer_append_string(buffer, "\\r", 2);
+ break;
+ case '"':
+ pm_buffer_append_string(buffer, "\\\"", 2);
+ break;
+ case '#': {
+ if (escaping == PM_BUFFER_ESCAPING_RUBY && index + 1 < length) {
+ const uint8_t next_byte = source[index + 1];
+ if (next_byte == '{' || next_byte == '@' || next_byte == '$') {
+ pm_buffer_append_byte(buffer, '\\');
+ }
+ }
+
+ pm_buffer_append_byte(buffer, '#');
+ break;
+ }
+ case '\\':
+ pm_buffer_append_string(buffer, "\\\\", 2);
+ break;
+ default:
+ pm_buffer_append_byte(buffer, byte);
+ break;
+ }
+ }
+ }
+}
+
+/**
+ * Prepend the given string to the buffer.
+ */
+void
+pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length) {
+ size_t cursor = buffer->length;
+ if (pm_buffer_append_length(buffer, length)) {
+ memmove(buffer->value + length, buffer->value, cursor);
+ memcpy(buffer->value, value, length);
+ }
+}
+
+/**
+ * Concatenate one buffer onto another.
+ */
+void
+pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source) {
+ if (source->length > 0) {
+ pm_buffer_append(destination, source->value, source->length);
+ }
+}
+
+/**
+ * Clear the buffer by reducing its size to 0. This does not free the allocated
+ * memory, but it does allow the buffer to be reused.
+ */
+void
+pm_buffer_clear(pm_buffer_t *buffer) {
+ buffer->length = 0;
+}
+
+/**
+ * Strip the whitespace from the end of the buffer.
+ */
+void
+pm_buffer_rstrip(pm_buffer_t *buffer) {
+ while (buffer->length > 0 && pm_char_is_whitespace((uint8_t) buffer->value[buffer->length - 1])) {
+ buffer->length--;
+ }
+}
+
+/**
+ * Checks if the buffer includes the given value.
+ */
+size_t
+pm_buffer_index(const pm_buffer_t *buffer, char value) {
+ const char *first = memchr(buffer->value, value, buffer->length);
+ return (first == NULL) ? SIZE_MAX : (size_t) (first - buffer->value);
+}
+
+/**
+ * Insert the given string into the buffer at the given index.
+ */
+void
+pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length) {
+ assert(index <= buffer->length);
+
+ if (index == buffer->length) {
+ pm_buffer_append_string(buffer, value, length);
+ } else {
+ pm_buffer_append_zeroes(buffer, length);
+ memmove(buffer->value + index + length, buffer->value + index, buffer->length - length - index);
+ memcpy(buffer->value + index, value, length);
+ }
+}
+
+/**
+ * Free the memory associated with the buffer.
+ */
+void
+pm_buffer_free(pm_buffer_t *buffer) {
+ xfree(buffer->value);
+}
diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h
new file mode 100644
index 0000000000..58f7b506cf
--- /dev/null
+++ b/prism/util/pm_buffer.h
@@ -0,0 +1,218 @@
+/**
+ * @file pm_buffer.h
+ *
+ * A wrapper around a contiguous block of allocated memory.
+ */
+#ifndef PRISM_BUFFER_H
+#define PRISM_BUFFER_H
+
+#include "prism/defines.h"
+#include "prism/util/pm_char.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/**
+ * A pm_buffer_t is a simple memory buffer that stores data in a contiguous
+ * block of memory.
+ */
+typedef struct {
+ /** The length of the buffer in bytes. */
+ size_t length;
+
+ /** The capacity of the buffer in bytes that has been allocated. */
+ size_t capacity;
+
+ /** A pointer to the start of the buffer. */
+ char *value;
+} pm_buffer_t;
+
+/**
+ * Return the size of the pm_buffer_t struct.
+ *
+ * @returns The size of the pm_buffer_t struct.
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void);
+
+/**
+ * Initialize a pm_buffer_t with the given capacity.
+ *
+ * @param buffer The buffer to initialize.
+ * @param capacity The capacity of the buffer.
+ * @returns True if the buffer was initialized successfully, false otherwise.
+ */
+bool pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity);
+
+/**
+ * Initialize a pm_buffer_t with its default values.
+ *
+ * @param buffer The buffer to initialize.
+ * @returns True if the buffer was initialized successfully, false otherwise.
+ */
+PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer);
+
+/**
+ * Return the value of the buffer.
+ *
+ * @param buffer The buffer to get the value of.
+ * @returns The value of the buffer.
+ */
+PRISM_EXPORTED_FUNCTION char * pm_buffer_value(const pm_buffer_t *buffer);
+
+/**
+ * Return the length of the buffer.
+ *
+ * @param buffer The buffer to get the length of.
+ * @returns The length of the buffer.
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(const pm_buffer_t *buffer);
+
+/**
+ * Append the given amount of space as zeroes to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param length The amount of space to append and zero.
+ */
+void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length);
+
+/**
+ * Append a formatted string to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param format The format string to append.
+ * @param ... The arguments to the format string.
+ */
+void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3);
+
+/**
+ * Append a string to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The string to append.
+ * @param length The length of the string to append.
+ */
+void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length);
+
+/**
+ * Append a list of bytes to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The bytes to append.
+ * @param length The length of the bytes to append.
+ */
+void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length);
+
+/**
+ * Append a single byte to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The byte to append.
+ */
+void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value);
+
+/**
+ * Append a 32-bit unsigned integer to the buffer as a variable-length integer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The integer to append.
+ */
+void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value);
+
+/**
+ * Append a 32-bit signed integer to the buffer as a variable-length integer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The integer to append.
+ */
+void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value);
+
+/**
+ * Append a double to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param value The double to append.
+ */
+void pm_buffer_append_double(pm_buffer_t *buffer, double value);
+
+/**
+ * The different types of escaping that can be performed by the buffer when
+ * appending a slice of Ruby source code.
+ */
+typedef enum {
+ PM_BUFFER_ESCAPING_RUBY,
+ PM_BUFFER_ESCAPING_JSON
+} pm_buffer_escaping_t;
+
+/**
+ * Append a slice of source code to the buffer.
+ *
+ * @param buffer The buffer to append to.
+ * @param source The source code to append.
+ * @param length The length of the source code to append.
+ * @param escaping The type of escaping to perform.
+ */
+void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping);
+
+/**
+ * Prepend the given string to the buffer.
+ *
+ * @param buffer The buffer to prepend to.
+ * @param value The string to prepend.
+ * @param length The length of the string to prepend.
+ */
+void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length);
+
+/**
+ * Concatenate one buffer onto another.
+ *
+ * @param destination The buffer to concatenate onto.
+ * @param source The buffer to concatenate.
+ */
+void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source);
+
+/**
+ * Clear the buffer by reducing its size to 0. This does not free the allocated
+ * memory, but it does allow the buffer to be reused.
+ *
+ * @param buffer The buffer to clear.
+ */
+void pm_buffer_clear(pm_buffer_t *buffer);
+
+/**
+ * Strip the whitespace from the end of the buffer.
+ *
+ * @param buffer The buffer to strip.
+ */
+void pm_buffer_rstrip(pm_buffer_t *buffer);
+
+/**
+ * Checks if the buffer includes the given value.
+ *
+ * @param buffer The buffer to check.
+ * @param value The value to check for.
+ * @returns The index of the first occurrence of the value in the buffer, or
+ * SIZE_MAX if the value is not found.
+ */
+size_t pm_buffer_index(const pm_buffer_t *buffer, char value);
+
+/**
+ * Insert the given string into the buffer at the given index.
+ *
+ * @param buffer The buffer to insert into.
+ * @param index The index to insert at.
+ * @param value The string to insert.
+ * @param length The length of the string to insert.
+ */
+void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length);
+
+/**
+ * Free the memory associated with the buffer.
+ *
+ * @param buffer The buffer to free.
+ */
+PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer);
+
+#endif
diff --git a/prism/util/pm_char.c b/prism/util/pm_char.c
new file mode 100644
index 0000000000..dce19abd1b
--- /dev/null
+++ b/prism/util/pm_char.c
@@ -0,0 +1,318 @@
+#include "prism/util/pm_char.h"
+
+#define PRISM_CHAR_BIT_WHITESPACE (1 << 0)
+#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1)
+#define PRISM_CHAR_BIT_REGEXP_OPTION (1 << 2)
+
+#define PRISM_NUMBER_BIT_BINARY_DIGIT (1 << 0)
+#define PRISM_NUMBER_BIT_BINARY_NUMBER (1 << 1)
+#define PRISM_NUMBER_BIT_OCTAL_DIGIT (1 << 2)
+#define PRISM_NUMBER_BIT_OCTAL_NUMBER (1 << 3)
+#define PRISM_NUMBER_BIT_DECIMAL_DIGIT (1 << 4)
+#define PRISM_NUMBER_BIT_DECIMAL_NUMBER (1 << 5)
+#define PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT (1 << 6)
+#define PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER (1 << 7)
+
+static const uint8_t pm_byte_table[256] = {
+// 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 0, 0, // 0x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
+ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3x
+ 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 4x
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 5x
+ 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 6x
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, // 7x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9x
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ax
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Bx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Cx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Dx
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Ex
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // Fx
+};
+
+static const uint8_t pm_number_table[256] = {
+ // 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 0x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 1x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 2x
+ 0xff, 0xff, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xf0, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 3x
+ 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 4x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xaa, // 5x
+ 0x00, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 6x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 7x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 8x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 9x
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ax
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Bx
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Cx
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Dx
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Ex
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Fx
+};
+
+/**
+ * Returns the number of characters at the start of the string that match the
+ * given kind. Disallows searching past the given maximum number of characters.
+ */
+static inline size_t
+pm_strspn_char_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
+ if (length <= 0) return 0;
+
+ size_t size = 0;
+ size_t maximum = (size_t) length;
+
+ while (size < maximum && (pm_byte_table[string[size]] & kind)) size++;
+ return size;
+}
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * whitespace. Disallows searching past the given maximum number of characters.
+ */
+size_t
+pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length) {
+ return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_WHITESPACE);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * whitespace while also tracking the location of each newline. Disallows
+ * searching past the given maximum number of characters.
+ */
+size_t
+pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list) {
+ if (length <= 0) return 0;
+
+ size_t size = 0;
+ size_t maximum = (size_t) length;
+
+ while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_WHITESPACE)) {
+ if (string[size] == '\n') {
+ pm_newline_list_append(newline_list, string + size);
+ }
+
+ size++;
+ }
+
+ return size;
+}
+
+/**
+ * Returns the number of characters at the start of the string that are inline
+ * whitespace. Disallows searching past the given maximum number of characters.
+ */
+size_t
+pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) {
+ return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_INLINE_WHITESPACE);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are regexp
+ * options. Disallows searching past the given maximum number of characters.
+ */
+size_t
+pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length) {
+ return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_REGEXP_OPTION);
+}
+
+/**
+ * Returns true if the given character matches the given kind.
+ */
+static inline bool
+pm_char_is_char_kind(const uint8_t b, uint8_t kind) {
+ return (pm_byte_table[b] & kind) != 0;
+}
+
+/**
+ * Returns true if the given character is a whitespace character.
+ */
+bool
+pm_char_is_whitespace(const uint8_t b) {
+ return pm_char_is_char_kind(b, PRISM_CHAR_BIT_WHITESPACE);
+}
+
+/**
+ * Returns true if the given character is an inline whitespace character.
+ */
+bool
+pm_char_is_inline_whitespace(const uint8_t b) {
+ return pm_char_is_char_kind(b, PRISM_CHAR_BIT_INLINE_WHITESPACE);
+}
+
+/**
+ * Scan through the string and return the number of characters at the start of
+ * the string that match the given kind. Disallows searching past the given
+ * maximum number of characters.
+ */
+static inline size_t
+pm_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) {
+ if (length <= 0) return 0;
+
+ size_t size = 0;
+ size_t maximum = (size_t) length;
+
+ while (size < maximum && (pm_number_table[string[size]] & kind)) size++;
+ return size;
+}
+
+/**
+ * Scan through the string and return the number of characters at the start of
+ * the string that match the given kind. Disallows searching past the given
+ * maximum number of characters.
+ *
+ * Additionally, report the location of the last invalid underscore character
+ * found in the string through the out invalid parameter.
+ */
+static inline size_t
+pm_strspn_number_kind_underscores(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid, uint8_t kind) {
+ if (length <= 0) return 0;
+
+ size_t size = 0;
+ size_t maximum = (size_t) length;
+
+ bool underscore = false;
+ while (size < maximum && (pm_number_table[string[size]] & kind)) {
+ if (string[size] == '_') {
+ if (underscore) *invalid = string + size;
+ underscore = true;
+ } else {
+ underscore = false;
+ }
+
+ size++;
+ }
+
+ if (string[size - 1] == '_') *invalid = string + size - 1;
+ return size;
+}
+
+/**
+ * Returns the number of characters at the start of the string that are binary
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t
+pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+ return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_BINARY_NUMBER);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are octal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t
+pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+ return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_OCTAL_NUMBER);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are decimal
+ * digits. Disallows searching past the given maximum number of characters.
+ */
+size_t
+pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length) {
+ return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_DECIMAL_DIGIT);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are decimal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore
+ */
+size_t
+pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+ return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_DECIMAL_NUMBER);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits. Disallows searching past the given maximum number of
+ * characters.
+ */
+size_t
+pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length) {
+ return pm_strspn_number_kind(string, length, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT);
+}
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits or underscores. Disallows searching past the given maximum
+ * number of characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t
+pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid) {
+ return pm_strspn_number_kind_underscores(string, length, invalid, PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER);
+}
+
+/**
+ * Returns true if the given character matches the given kind.
+ */
+static inline bool
+pm_char_is_number_kind(const uint8_t b, uint8_t kind) {
+ return (pm_number_table[b] & kind) != 0;
+}
+
+/**
+ * Returns true if the given character is a binary digit.
+ */
+bool
+pm_char_is_binary_digit(const uint8_t b) {
+ return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_BINARY_DIGIT);
+}
+
+/**
+ * Returns true if the given character is an octal digit.
+ */
+bool
+pm_char_is_octal_digit(const uint8_t b) {
+ return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_OCTAL_DIGIT);
+}
+
+/**
+ * Returns true if the given character is a decimal digit.
+ */
+bool
+pm_char_is_decimal_digit(const uint8_t b) {
+ return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_DECIMAL_DIGIT);
+}
+
+/**
+ * Returns true if the given character is a hexadecimal digit.
+ */
+bool
+pm_char_is_hexadecimal_digit(const uint8_t b) {
+ return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT);
+}
+
+#undef PRISM_CHAR_BIT_WHITESPACE
+#undef PRISM_CHAR_BIT_INLINE_WHITESPACE
+#undef PRISM_CHAR_BIT_REGEXP_OPTION
+
+#undef PRISM_NUMBER_BIT_BINARY_DIGIT
+#undef PRISM_NUMBER_BIT_BINARY_NUMBER
+#undef PRISM_NUMBER_BIT_OCTAL_DIGIT
+#undef PRISM_NUMBER_BIT_OCTAL_NUMBER
+#undef PRISM_NUMBER_BIT_DECIMAL_DIGIT
+#undef PRISM_NUMBER_BIT_DECIMAL_NUMBER
+#undef PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER
+#undef PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT
diff --git a/prism/util/pm_char.h b/prism/util/pm_char.h
new file mode 100644
index 0000000000..deeafd6321
--- /dev/null
+++ b/prism/util/pm_char.h
@@ -0,0 +1,204 @@
+/**
+ * @file pm_char.h
+ *
+ * Functions for working with characters and strings.
+ */
+#ifndef PRISM_CHAR_H
+#define PRISM_CHAR_H
+
+#include "prism/defines.h"
+#include "prism/util/pm_newline_list.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * whitespace. Disallows searching past the given maximum number of characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @return The number of characters at the start of the string that are
+ * whitespace.
+ */
+size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length);
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * whitespace while also tracking the location of each newline. Disallows
+ * searching past the given maximum number of characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @param newline_list The list of newlines to populate.
+ * @return The number of characters at the start of the string that are
+ * whitespace.
+ */
+size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list);
+
+/**
+ * Returns the number of characters at the start of the string that are inline
+ * whitespace. Disallows searching past the given maximum number of characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @return The number of characters at the start of the string that are inline
+ * whitespace.
+ */
+size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length);
+
+/**
+ * Returns the number of characters at the start of the string that are decimal
+ * digits. Disallows searching past the given maximum number of characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @return The number of characters at the start of the string that are decimal
+ * digits.
+ */
+size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @return The number of characters at the start of the string that are
+ * hexadecimal digits.
+ */
+size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);
+
+/**
+ * Returns the number of characters at the start of the string that are octal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @param invalid The pointer to set to the index of the first invalid
+ * underscore.
+ * @return The number of characters at the start of the string that are octal
+ * digits or underscores.
+ */
+size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/**
+ * Returns the number of characters at the start of the string that are decimal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @param invalid The pointer to set to the index of the first invalid
+ * underscore.
+ * @return The number of characters at the start of the string that are decimal
+ * digits or underscores.
+ */
+size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/**
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits or underscores. Disallows searching past the given maximum
+ * number of characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @param invalid The pointer to set to the index of the first invalid
+ * underscore.
+ * @return The number of characters at the start of the string that are
+ * hexadecimal digits or underscores.
+ */
+size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/**
+ * Returns the number of characters at the start of the string that are regexp
+ * options. Disallows searching past the given maximum number of characters.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @return The number of characters at the start of the string that are regexp
+ * options.
+ */
+size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
+
+/**
+ * Returns the number of characters at the start of the string that are binary
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ *
+ * @param string The string to search.
+ * @param length The maximum number of characters to search.
+ * @param invalid The pointer to set to the index of the first invalid
+ * underscore.
+ * @return The number of characters at the start of the string that are binary
+ * digits or underscores.
+ */
+size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/**
+ * Returns true if the given character is a whitespace character.
+ *
+ * @param b The character to check.
+ * @return True if the given character is a whitespace character.
+ */
+bool pm_char_is_whitespace(const uint8_t b);
+
+/**
+ * Returns true if the given character is an inline whitespace character.
+ *
+ * @param b The character to check.
+ * @return True if the given character is an inline whitespace character.
+ */
+bool pm_char_is_inline_whitespace(const uint8_t b);
+
+/**
+ * Returns true if the given character is a binary digit.
+ *
+ * @param b The character to check.
+ * @return True if the given character is a binary digit.
+ */
+bool pm_char_is_binary_digit(const uint8_t b);
+
+/**
+ * Returns true if the given character is an octal digit.
+ *
+ * @param b The character to check.
+ * @return True if the given character is an octal digit.
+ */
+bool pm_char_is_octal_digit(const uint8_t b);
+
+/**
+ * Returns true if the given character is a decimal digit.
+ *
+ * @param b The character to check.
+ * @return True if the given character is a decimal digit.
+ */
+bool pm_char_is_decimal_digit(const uint8_t b);
+
+/**
+ * Returns true if the given character is a hexadecimal digit.
+ *
+ * @param b The character to check.
+ * @return True if the given character is a hexadecimal digit.
+ */
+bool pm_char_is_hexadecimal_digit(const uint8_t b);
+
+#endif
diff --git a/prism/util/pm_constant_pool.c b/prism/util/pm_constant_pool.c
new file mode 100644
index 0000000000..624002cec9
--- /dev/null
+++ b/prism/util/pm_constant_pool.c
@@ -0,0 +1,338 @@
+#include "prism/util/pm_constant_pool.h"
+
+/**
+ * Initialize a list of constant ids.
+ */
+void
+pm_constant_id_list_init(pm_constant_id_list_t *list) {
+ list->ids = NULL;
+ list->size = 0;
+ list->capacity = 0;
+}
+
+/**
+ * Initialize a list of constant ids with a given capacity.
+ */
+void
+pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity) {
+ list->ids = xcalloc(capacity, sizeof(pm_constant_id_t));
+ if (list->ids == NULL) abort();
+
+ list->size = 0;
+ list->capacity = capacity;
+}
+
+/**
+ * Append a constant id to a list of constant ids. Returns false if any
+ * potential reallocations fail.
+ */
+bool
+pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id) {
+ if (list->size >= list->capacity) {
+ list->capacity = list->capacity == 0 ? 8 : list->capacity * 2;
+ list->ids = (pm_constant_id_t *) xrealloc(list->ids, sizeof(pm_constant_id_t) * list->capacity);
+ if (list->ids == NULL) return false;
+ }
+
+ list->ids[list->size++] = id;
+ return true;
+}
+
+/**
+ * Insert a constant id into a list of constant ids at the specified index.
+ */
+void
+pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id) {
+ assert(index < list->capacity);
+ assert(list->ids[index] == PM_CONSTANT_ID_UNSET);
+
+ list->ids[index] = id;
+ list->size++;
+}
+
+/**
+ * Checks if the current constant id list includes the given constant id.
+ */
+bool
+pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id) {
+ for (size_t index = 0; index < list->size; index++) {
+ if (list->ids[index] == id) return true;
+ }
+ return false;
+}
+
+/**
+ * Free the memory associated with a list of constant ids.
+ */
+void
+pm_constant_id_list_free(pm_constant_id_list_t *list) {
+ if (list->ids != NULL) {
+ xfree(list->ids);
+ }
+}
+
+/**
+ * A relatively simple hash function (djb2) that is used to hash strings. We are
+ * optimizing here for simplicity and speed.
+ */
+static inline uint32_t
+pm_constant_pool_hash(const uint8_t *start, size_t length) {
+ // This is a prime number used as the initial value for the hash function.
+ uint32_t value = 5381;
+
+ for (size_t index = 0; index < length; index++) {
+ value = ((value << 5) + value) + start[index];
+ }
+
+ return value;
+}
+
+/**
+ * https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+ */
+static uint32_t
+next_power_of_two(uint32_t v) {
+ // Avoid underflow in subtraction on next line.
+ if (v == 0) {
+ // 1 is the nearest power of 2 to 0 (2^0)
+ return 1;
+ }
+ v--;
+ v |= v >> 1;
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+ v++;
+ return v;
+}
+
+#ifndef NDEBUG
+static bool
+is_power_of_two(uint32_t size) {
+ return (size & (size - 1)) == 0;
+}
+#endif
+
+/**
+ * Resize a constant pool to a given capacity.
+ */
+static inline bool
+pm_constant_pool_resize(pm_constant_pool_t *pool) {
+ assert(is_power_of_two(pool->capacity));
+
+ uint32_t next_capacity = pool->capacity * 2;
+ if (next_capacity < pool->capacity) return false;
+
+ const uint32_t mask = next_capacity - 1;
+ const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t);
+
+ void *next = xcalloc(next_capacity, element_size);
+ if (next == NULL) return false;
+
+ pm_constant_pool_bucket_t *next_buckets = next;
+ pm_constant_t *next_constants = (void *)(((char *) next) + next_capacity * sizeof(pm_constant_pool_bucket_t));
+
+ // For each bucket in the current constant pool, find the index in the
+ // next constant pool, and insert it.
+ for (uint32_t index = 0; index < pool->capacity; index++) {
+ pm_constant_pool_bucket_t *bucket = &pool->buckets[index];
+
+ // If an id is set on this constant, then we know we have content here.
+ // In this case we need to insert it into the next constant pool.
+ if (bucket->id != PM_CONSTANT_ID_UNSET) {
+ uint32_t next_index = bucket->hash & mask;
+
+ // This implements linear scanning to find the next available slot
+ // in case this index is already taken. We don't need to bother
+ // comparing the values since we know that the hash is unique.
+ while (next_buckets[next_index].id != PM_CONSTANT_ID_UNSET) {
+ next_index = (next_index + 1) & mask;
+ }
+
+ // Here we copy over the entire bucket, which includes the id so
+ // that they are consistent between resizes.
+ next_buckets[next_index] = *bucket;
+ }
+ }
+
+ // The constants are stable with respect to hash table resizes.
+ memcpy(next_constants, pool->constants, pool->size * sizeof(pm_constant_t));
+
+ // pool->constants and pool->buckets are allocated out of the same chunk
+ // of memory, with the buckets coming first.
+ xfree(pool->buckets);
+ pool->constants = next_constants;
+ pool->buckets = next_buckets;
+ pool->capacity = next_capacity;
+ return true;
+}
+
+/**
+ * Initialize a new constant pool with a given capacity.
+ */
+bool
+pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity) {
+ const uint32_t maximum = (~((uint32_t) 0));
+ if (capacity >= ((maximum / 2) + 1)) return false;
+
+ capacity = next_power_of_two(capacity);
+ const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t);
+ void *memory = xcalloc(capacity, element_size);
+ if (memory == NULL) return false;
+
+ pool->buckets = memory;
+ pool->constants = (void *)(((char *)memory) + capacity * sizeof(pm_constant_pool_bucket_t));
+ pool->size = 0;
+ pool->capacity = capacity;
+ return true;
+}
+
+/**
+ * Return a pointer to the constant indicated by the given constant id.
+ */
+pm_constant_t *
+pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id) {
+ assert(constant_id != PM_CONSTANT_ID_UNSET && constant_id <= pool->size);
+ return &pool->constants[constant_id - 1];
+}
+
+/**
+ * Find a constant in a constant pool. Returns the id of the constant, or 0 if
+ * the constant is not found.
+ */
+pm_constant_id_t
+pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
+ assert(is_power_of_two(pool->capacity));
+ const uint32_t mask = pool->capacity - 1;
+
+ uint32_t hash = pm_constant_pool_hash(start, length);
+ uint32_t index = hash & mask;
+ pm_constant_pool_bucket_t *bucket;
+
+ while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) {
+ pm_constant_t *constant = &pool->constants[bucket->id - 1];
+ if ((constant->length == length) && memcmp(constant->start, start, length) == 0) {
+ return bucket->id;
+ }
+
+ index = (index + 1) & mask;
+ }
+
+ return PM_CONSTANT_ID_UNSET;
+}
+
+/**
+ * Insert a constant into a constant pool and return its index in the pool.
+ */
+static inline pm_constant_id_t
+pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t length, pm_constant_pool_bucket_type_t type) {
+ if (pool->size >= (pool->capacity / 4 * 3)) {
+ if (!pm_constant_pool_resize(pool)) return PM_CONSTANT_ID_UNSET;
+ }
+
+ assert(is_power_of_two(pool->capacity));
+ const uint32_t mask = pool->capacity - 1;
+
+ uint32_t hash = pm_constant_pool_hash(start, length);
+ uint32_t index = hash & mask;
+ pm_constant_pool_bucket_t *bucket;
+
+ while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) {
+ // If there is a collision, then we need to check if the content is the
+ // same as the content we are trying to insert. If it is, then we can
+ // return the id of the existing constant.
+ pm_constant_t *constant = &pool->constants[bucket->id - 1];
+
+ if ((constant->length == length) && memcmp(constant->start, start, length) == 0) {
+ // Since we have found a match, we need to check if this is
+ // attempting to insert a shared or an owned constant. We want to
+ // prefer shared constants since they don't require allocations.
+ if (type == PM_CONSTANT_POOL_BUCKET_OWNED) {
+ // If we're attempting to insert an owned constant and we have
+ // an existing constant, then either way we don't want the given
+ // memory. Either it's duplicated with the existing constant or
+ // it's not necessary because we have a shared version.
+ xfree((void *) start);
+ } else if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) {
+ // If we're attempting to insert a shared constant and the
+ // existing constant is owned, then we can free the owned
+ // constant and replace it with the shared constant.
+ xfree((void *) constant->start);
+ constant->start = start;
+ bucket->type = (unsigned int) (PM_CONSTANT_POOL_BUCKET_DEFAULT & 0x3);
+ }
+
+ return bucket->id;
+ }
+
+ index = (index + 1) & mask;
+ }
+
+ // IDs are allocated starting at 1, since the value 0 denotes a non-existent
+ // constant.
+ uint32_t id = ++pool->size;
+ assert(pool->size < ((uint32_t) (1 << 30)));
+
+ *bucket = (pm_constant_pool_bucket_t) {
+ .id = (unsigned int) (id & 0x3fffffff),
+ .type = (unsigned int) (type & 0x3),
+ .hash = hash
+ };
+
+ pool->constants[id - 1] = (pm_constant_t) {
+ .start = start,
+ .length = length,
+ };
+
+ return id;
+}
+
+/**
+ * Insert a constant into a constant pool. Returns the id of the constant, or
+ * PM_CONSTANT_ID_UNSET if any potential calls to resize fail.
+ */
+pm_constant_id_t
+pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
+ return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_DEFAULT);
+}
+
+/**
+ * Insert a constant into a constant pool from memory that is now owned by the
+ * constant pool. Returns the id of the constant, or PM_CONSTANT_ID_UNSET if any
+ * potential calls to resize fail.
+ */
+pm_constant_id_t
+pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length) {
+ return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_OWNED);
+}
+
+/**
+ * Insert a constant into a constant pool from memory that is constant. Returns
+ * the id of the constant, or PM_CONSTANT_ID_UNSET if any potential calls to
+ * resize fail.
+ */
+pm_constant_id_t
+pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length) {
+ return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_CONSTANT);
+}
+
+/**
+ * Free the memory associated with a constant pool.
+ */
+void
+pm_constant_pool_free(pm_constant_pool_t *pool) {
+ // For each constant in the current constant pool, free the contents if the
+ // contents are owned.
+ for (uint32_t index = 0; index < pool->capacity; index++) {
+ pm_constant_pool_bucket_t *bucket = &pool->buckets[index];
+
+ // If an id is set on this constant, then we know we have content here.
+ if (bucket->id != PM_CONSTANT_ID_UNSET && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) {
+ pm_constant_t *constant = &pool->constants[bucket->id - 1];
+ xfree((void *) constant->start);
+ }
+ }
+
+ xfree(pool->buckets);
+}
diff --git a/prism/util/pm_constant_pool.h b/prism/util/pm_constant_pool.h
new file mode 100644
index 0000000000..6df23f8f50
--- /dev/null
+++ b/prism/util/pm_constant_pool.h
@@ -0,0 +1,218 @@
+/**
+ * @file pm_constant_pool.h
+ *
+ * A data structure that stores a set of strings.
+ *
+ * Each string is assigned a unique id, which can be used to compare strings for
+ * equality. This comparison ends up being much faster than strcmp, since it
+ * only requires a single integer comparison.
+ */
+#ifndef PRISM_CONSTANT_POOL_H
+#define PRISM_CONSTANT_POOL_H
+
+#include "prism/defines.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/**
+ * When we allocate constants into the pool, we reserve 0 to mean that the slot
+ * is not yet filled. This constant is reused in other places to indicate the
+ * lack of a constant id.
+ */
+#define PM_CONSTANT_ID_UNSET 0
+
+/**
+ * A constant id is a unique identifier for a constant in the constant pool.
+ */
+typedef uint32_t pm_constant_id_t;
+
+/**
+ * A list of constant IDs. Usually used to represent a set of locals.
+ */
+typedef struct {
+ /** The number of constant ids in the list. */
+ size_t size;
+
+ /** The number of constant ids that have been allocated in the list. */
+ size_t capacity;
+
+ /** The constant ids in the list. */
+ pm_constant_id_t *ids;
+} pm_constant_id_list_t;
+
+/**
+ * Initialize a list of constant ids.
+ *
+ * @param list The list to initialize.
+ */
+void pm_constant_id_list_init(pm_constant_id_list_t *list);
+
+/**
+ * Initialize a list of constant ids with a given capacity.
+ *
+ * @param list The list to initialize.
+ * @param capacity The initial capacity of the list.
+ */
+void pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity);
+
+/**
+ * Append a constant id to a list of constant ids. Returns false if any
+ * potential reallocations fail.
+ *
+ * @param list The list to append to.
+ * @param id The id to append.
+ * @return Whether the append succeeded.
+ */
+bool pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id);
+
+/**
+ * Insert a constant id into a list of constant ids at the specified index.
+ *
+ * @param list The list to insert into.
+ * @param index The index at which to insert.
+ * @param id The id to insert.
+ */
+void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id);
+
+/**
+ * Checks if the current constant id list includes the given constant id.
+ *
+ * @param list The list to check.
+ * @param id The id to check for.
+ * @return Whether the list includes the given id.
+ */
+bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id);
+
+/**
+ * Free the memory associated with a list of constant ids.
+ *
+ * @param list The list to free.
+ */
+void pm_constant_id_list_free(pm_constant_id_list_t *list);
+
+/**
+ * The type of bucket in the constant pool hash map. This determines how the
+ * bucket should be freed.
+ */
+typedef unsigned int pm_constant_pool_bucket_type_t;
+
+/** By default, each constant is a slice of the source. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0;
+
+/** An owned constant is one for which memory has been allocated. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1;
+
+/** A constant constant is known at compile time. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2;
+
+/** A bucket in the hash map. */
+typedef struct {
+ /** The incremental ID used for indexing back into the pool. */
+ unsigned int id: 30;
+
+ /** The type of the bucket, which determines how to free it. */
+ pm_constant_pool_bucket_type_t type: 2;
+
+ /** The hash of the bucket. */
+ uint32_t hash;
+} pm_constant_pool_bucket_t;
+
+/** A constant in the pool which effectively stores a string. */
+typedef struct {
+ /** A pointer to the start of the string. */
+ const uint8_t *start;
+
+ /** The length of the string. */
+ size_t length;
+} pm_constant_t;
+
+/** The overall constant pool, which stores constants found while parsing. */
+typedef struct {
+ /** The buckets in the hash map. */
+ pm_constant_pool_bucket_t *buckets;
+
+ /** The constants that are stored in the buckets. */
+ pm_constant_t *constants;
+
+ /** The number of buckets in the hash map. */
+ uint32_t size;
+
+ /** The number of buckets that have been allocated in the hash map. */
+ uint32_t capacity;
+} pm_constant_pool_t;
+
+/**
+ * Initialize a new constant pool with a given capacity.
+ *
+ * @param pool The pool to initialize.
+ * @param capacity The initial capacity of the pool.
+ * @return Whether the initialization succeeded.
+ */
+bool pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity);
+
+/**
+ * Return a pointer to the constant indicated by the given constant id.
+ *
+ * @param pool The pool to get the constant from.
+ * @param constant_id The id of the constant to get.
+ * @return A pointer to the constant.
+ */
+pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id);
+
+/**
+ * Find a constant in a constant pool. Returns the id of the constant, or 0 if
+ * the constant is not found.
+ *
+ * @param pool The pool to find the constant in.
+ * @param start A pointer to the start of the constant.
+ * @param length The length of the constant.
+ * @return The id of the constant.
+ */
+pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+/**
+ * Insert a constant into a constant pool that is a slice of a source string.
+ * Returns the id of the constant, or 0 if any potential calls to resize fail.
+ *
+ * @param pool The pool to insert the constant into.
+ * @param start A pointer to the start of the constant.
+ * @param length The length of the constant.
+ * @return The id of the constant.
+ */
+pm_constant_id_t pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+/**
+ * Insert a constant into a constant pool from memory that is now owned by the
+ * constant pool. Returns the id of the constant, or 0 if any potential calls to
+ * resize fail.
+ *
+ * @param pool The pool to insert the constant into.
+ * @param start A pointer to the start of the constant.
+ * @param length The length of the constant.
+ * @return The id of the constant.
+ */
+pm_constant_id_t pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length);
+
+/**
+ * Insert a constant into a constant pool from memory that is constant. Returns
+ * the id of the constant, or 0 if any potential calls to resize fail.
+ *
+ * @param pool The pool to insert the constant into.
+ * @param start A pointer to the start of the constant.
+ * @param length The length of the constant.
+ * @return The id of the constant.
+ */
+pm_constant_id_t pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+/**
+ * Free the memory associated with a constant pool.
+ *
+ * @param pool The pool to free.
+ */
+void pm_constant_pool_free(pm_constant_pool_t *pool);
+
+#endif
diff --git a/prism/util/pm_integer.c b/prism/util/pm_integer.c
new file mode 100644
index 0000000000..5b0f34465c
--- /dev/null
+++ b/prism/util/pm_integer.c
@@ -0,0 +1,670 @@
+#include "prism/util/pm_integer.h"
+
+/**
+ * Pull out the length and values from the integer, regardless of the form in
+ * which the length/values are stored.
+ */
+#define INTEGER_EXTRACT(integer, length_variable, values_variable) \
+ if ((integer)->values == NULL) { \
+ length_variable = 1; \
+ values_variable = &(integer)->value; \
+ } else { \
+ length_variable = (integer)->length; \
+ values_variable = (integer)->values; \
+ }
+
+/**
+ * Adds two positive pm_integer_t with the given base.
+ * Return pm_integer_t with values allocated. Not normalized.
+ */
+static void
+big_add(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) {
+ size_t left_length;
+ uint32_t *left_values;
+ INTEGER_EXTRACT(left, left_length, left_values)
+
+ size_t right_length;
+ uint32_t *right_values;
+ INTEGER_EXTRACT(right, right_length, right_values)
+
+ size_t length = left_length < right_length ? right_length : left_length;
+ uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * (length + 1));
+ if (values == NULL) return;
+
+ uint64_t carry = 0;
+ for (size_t index = 0; index < length; index++) {
+ uint64_t sum = carry + (index < left_length ? left_values[index] : 0) + (index < right_length ? right_values[index] : 0);
+ values[index] = (uint32_t) (sum % base);
+ carry = sum / base;
+ }
+
+ if (carry > 0) {
+ values[length] = (uint32_t) carry;
+ length++;
+ }
+
+ *destination = (pm_integer_t) { 0, length, values, false };
+}
+
+/**
+ * Internal use for karatsuba_multiply. Calculates `a - b - c` with the given
+ * base. Assume a, b, c, a - b - c all to be positive.
+ * Return pm_integer_t with values allocated. Not normalized.
+ */
+static void
+big_sub2(pm_integer_t *destination, pm_integer_t *a, pm_integer_t *b, pm_integer_t *c, uint64_t base) {
+ size_t a_length;
+ uint32_t *a_values;
+ INTEGER_EXTRACT(a, a_length, a_values)
+
+ size_t b_length;
+ uint32_t *b_values;
+ INTEGER_EXTRACT(b, b_length, b_values)
+
+ size_t c_length;
+ uint32_t *c_values;
+ INTEGER_EXTRACT(c, c_length, c_values)
+
+ uint32_t *values = (uint32_t*) xmalloc(sizeof(uint32_t) * a_length);
+ int64_t carry = 0;
+
+ for (size_t index = 0; index < a_length; index++) {
+ int64_t sub = (
+ carry +
+ a_values[index] -
+ (index < b_length ? b_values[index] : 0) -
+ (index < c_length ? c_values[index] : 0)
+ );
+
+ if (sub >= 0) {
+ values[index] = (uint32_t) sub;
+ carry = 0;
+ } else {
+ sub += 2 * (int64_t) base;
+ values[index] = (uint32_t) ((uint64_t) sub % base);
+ carry = sub / (int64_t) base - 2;
+ }
+ }
+
+ while (a_length > 1 && values[a_length - 1] == 0) a_length--;
+ *destination = (pm_integer_t) { 0, a_length, values, false };
+}
+
+/**
+ * Multiply two positive integers with the given base using karatsuba algorithm.
+ * Return pm_integer_t with values allocated. Not normalized.
+ */
+static void
+karatsuba_multiply(pm_integer_t *destination, pm_integer_t *left, pm_integer_t *right, uint64_t base) {
+ size_t left_length;
+ uint32_t *left_values;
+ INTEGER_EXTRACT(left, left_length, left_values)
+
+ size_t right_length;
+ uint32_t *right_values;
+ INTEGER_EXTRACT(right, right_length, right_values)
+
+ if (left_length > right_length) {
+ size_t temporary_length = left_length;
+ left_length = right_length;
+ right_length = temporary_length;
+
+ uint32_t *temporary_values = left_values;
+ left_values = right_values;
+ right_values = temporary_values;
+ }
+
+ if (left_length <= 10) {
+ size_t length = left_length + right_length;
+ uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t));
+ if (values == NULL) return;
+
+ for (size_t left_index = 0; left_index < left_length; left_index++) {
+ uint32_t carry = 0;
+ for (size_t right_index = 0; right_index < right_length; right_index++) {
+ uint64_t product = (uint64_t) left_values[left_index] * right_values[right_index] + values[left_index + right_index] + carry;
+ values[left_index + right_index] = (uint32_t) (product % base);
+ carry = (uint32_t) (product / base);
+ }
+ values[left_index + right_length] = carry;
+ }
+
+ while (length > 1 && values[length - 1] == 0) length--;
+ *destination = (pm_integer_t) { 0, length, values, false };
+ return;
+ }
+
+ if (left_length * 2 <= right_length) {
+ uint32_t *values = (uint32_t *) xcalloc(left_length + right_length, sizeof(uint32_t));
+
+ for (size_t start_offset = 0; start_offset < right_length; start_offset += left_length) {
+ size_t end_offset = start_offset + left_length;
+ if (end_offset > right_length) end_offset = right_length;
+
+ pm_integer_t sliced_left = {
+ .value = 0,
+ .length = left_length,
+ .values = left_values,
+ .negative = false
+ };
+
+ pm_integer_t sliced_right = {
+ .value = 0,
+ .length = end_offset - start_offset,
+ .values = right_values + start_offset,
+ .negative = false
+ };
+
+ pm_integer_t product;
+ karatsuba_multiply(&product, &sliced_left, &sliced_right, base);
+
+ uint32_t carry = 0;
+ for (size_t index = 0; index < product.length; index++) {
+ uint64_t sum = (uint64_t) values[start_offset + index] + product.values[index] + carry;
+ values[start_offset + index] = (uint32_t) (sum % base);
+ carry = (uint32_t) (sum / base);
+ }
+
+ if (carry > 0) values[start_offset + product.length] += carry;
+ pm_integer_free(&product);
+ }
+
+ *destination = (pm_integer_t) { 0, left_length + right_length, values, false };
+ return;
+ }
+
+ size_t half = left_length / 2;
+ pm_integer_t x0 = { 0, half, left_values, false };
+ pm_integer_t x1 = { 0, left_length - half, left_values + half, false };
+ pm_integer_t y0 = { 0, half, right_values, false };
+ pm_integer_t y1 = { 0, right_length - half, right_values + half, false };
+
+ pm_integer_t z0 = { 0 };
+ karatsuba_multiply(&z0, &x0, &y0, base);
+
+ pm_integer_t z2 = { 0 };
+ karatsuba_multiply(&z2, &x1, &y1, base);
+
+ // For simplicity to avoid considering negative values,
+ // use `z1 = (x0 + x1) * (y0 + y1) - z0 - z2` instead of original karatsuba algorithm.
+ pm_integer_t x01 = { 0 };
+ big_add(&x01, &x0, &x1, base);
+
+ pm_integer_t y01 = { 0 };
+ big_add(&y01, &y0, &y1, base);
+
+ pm_integer_t xy = { 0 };
+ karatsuba_multiply(&xy, &x01, &y01, base);
+
+ pm_integer_t z1;
+ big_sub2(&z1, &xy, &z0, &z2, base);
+
+ size_t length = left_length + right_length;
+ uint32_t *values = (uint32_t*) xcalloc(length, sizeof(uint32_t));
+
+ assert(z0.values != NULL);
+ memcpy(values, z0.values, sizeof(uint32_t) * z0.length);
+
+ assert(z2.values != NULL);
+ memcpy(values + 2 * half, z2.values, sizeof(uint32_t) * z2.length);
+
+ uint32_t carry = 0;
+ for(size_t index = 0; index < z1.length; index++) {
+ uint64_t sum = (uint64_t) carry + values[index + half] + z1.values[index];
+ values[index + half] = (uint32_t) (sum % base);
+ carry = (uint32_t) (sum / base);
+ }
+
+ for(size_t index = half + z1.length; carry > 0; index++) {
+ uint64_t sum = (uint64_t) carry + values[index];
+ values[index] = (uint32_t) (sum % base);
+ carry = (uint32_t) (sum / base);
+ }
+
+ while (length > 1 && values[length - 1] == 0) length--;
+ pm_integer_free(&z0);
+ pm_integer_free(&z1);
+ pm_integer_free(&z2);
+ pm_integer_free(&x01);
+ pm_integer_free(&y01);
+ pm_integer_free(&xy);
+
+ *destination = (pm_integer_t) { 0, length, values, false };
+}
+
+/**
+ * The values of a hexadecimal digit, where the index is the ASCII character.
+ * Note that there's an odd exception here where _ is mapped to 0. This is
+ * because it's possible for us to end up trying to parse a number that has
+ * already had an error attached to it, and we want to provide _something_ to
+ * the user.
+ */
+static const int8_t pm_integer_parse_digit_values[256] = {
+// 0 1 2 3 4 5 6 7 8 9 A B C D E F
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 0x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 1x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 2x
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, // 3x
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 4x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, // 5x
+ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 6x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 7x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 8x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // 9x
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ax
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Bx
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Cx
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Dx
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Ex
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, // Fx
+};
+
+/**
+ * Return the value of a hexadecimal digit in a uint8_t.
+ */
+static uint8_t
+pm_integer_parse_digit(const uint8_t character) {
+ int8_t value = pm_integer_parse_digit_values[character];
+ assert(value != -1 && "invalid digit");
+
+ return (uint8_t) value;
+}
+
+/**
+ * Create a pm_integer_t from uint64_t with the given base. It is assumed that
+ * the memory for the pm_integer_t pointer has been zeroed.
+ */
+static void
+pm_integer_from_uint64(pm_integer_t *integer, uint64_t value, uint64_t base) {
+ if (value < base) {
+ integer->value = (uint32_t) value;
+ return;
+ }
+
+ size_t length = 0;
+ uint64_t length_value = value;
+ while (length_value > 0) {
+ length++;
+ length_value /= base;
+ }
+
+ uint32_t *values = (uint32_t *) xmalloc(sizeof(uint32_t) * length);
+ if (values == NULL) return;
+
+ for (size_t value_index = 0; value_index < length; value_index++) {
+ values[value_index] = (uint32_t) (value % base);
+ value /= base;
+ }
+
+ integer->length = length;
+ integer->values = values;
+}
+
+/**
+ * Normalize pm_integer_t.
+ * Heading zero values will be removed. If the integer fits into uint32_t,
+ * values is set to NULL, length is set to 0, and value field will be used.
+ */
+static void
+pm_integer_normalize(pm_integer_t *integer) {
+ if (integer->values == NULL) {
+ return;
+ }
+
+ while (integer->length > 1 && integer->values[integer->length - 1] == 0) {
+ integer->length--;
+ }
+
+ if (integer->length > 1) {
+ return;
+ }
+
+ uint32_t value = integer->values[0];
+ bool negative = integer->negative && value != 0;
+
+ pm_integer_free(integer);
+ *integer = (pm_integer_t) { .value = value, .length = 0, .values = NULL, .negative = negative };
+}
+
+/**
+ * Convert base of the integer.
+ * In practice, it converts 10**9 to 1<<32 or 1<<32 to 10**9.
+ */
+static void
+pm_integer_convert_base(pm_integer_t *destination, const pm_integer_t *source, uint64_t base_from, uint64_t base_to) {
+ size_t source_length;
+ const uint32_t *source_values;
+ INTEGER_EXTRACT(source, source_length, source_values)
+
+ size_t bigints_length = (source_length + 1) / 2;
+ assert(bigints_length > 0);
+
+ pm_integer_t *bigints = (pm_integer_t *) xcalloc(bigints_length, sizeof(pm_integer_t));
+ if (bigints == NULL) return;
+
+ for (size_t index = 0; index < source_length; index += 2) {
+ uint64_t value = source_values[index] + base_from * (index + 1 < source_length ? source_values[index + 1] : 0);
+ pm_integer_from_uint64(&bigints[index / 2], value, base_to);
+ }
+
+ pm_integer_t base = { 0 };
+ pm_integer_from_uint64(&base, base_from, base_to);
+
+ while (bigints_length > 1) {
+ pm_integer_t next_base;
+ karatsuba_multiply(&next_base, &base, &base, base_to);
+
+ pm_integer_free(&base);
+ base = next_base;
+
+ size_t next_length = (bigints_length + 1) / 2;
+ pm_integer_t *next_bigints = (pm_integer_t *) xcalloc(next_length, sizeof(pm_integer_t));
+
+ for (size_t bigints_index = 0; bigints_index < bigints_length; bigints_index += 2) {
+ if (bigints_index + 1 == bigints_length) {
+ next_bigints[bigints_index / 2] = bigints[bigints_index];
+ } else {
+ pm_integer_t multiplied = { 0 };
+ karatsuba_multiply(&multiplied, &base, &bigints[bigints_index + 1], base_to);
+
+ big_add(&next_bigints[bigints_index / 2], &bigints[bigints_index], &multiplied, base_to);
+ pm_integer_free(&bigints[bigints_index]);
+ pm_integer_free(&bigints[bigints_index + 1]);
+ pm_integer_free(&multiplied);
+ }
+ }
+
+ xfree(bigints);
+ bigints = next_bigints;
+ bigints_length = next_length;
+ }
+
+ *destination = bigints[0];
+ destination->negative = source->negative;
+ pm_integer_normalize(destination);
+
+ xfree(bigints);
+ pm_integer_free(&base);
+}
+
+#undef INTEGER_EXTRACT
+
+/**
+ * Convert digits to integer with the given power-of-two base.
+ */
+static void
+pm_integer_parse_powof2(pm_integer_t *integer, uint32_t base, const uint8_t *digits, size_t digits_length) {
+ size_t bit = 1;
+ while (base > (uint32_t) (1 << bit)) bit++;
+
+ size_t length = (digits_length * bit + 31) / 32;
+ uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t));
+
+ for (size_t digit_index = 0; digit_index < digits_length; digit_index++) {
+ size_t bit_position = bit * (digits_length - digit_index - 1);
+ uint32_t value = digits[digit_index];
+
+ size_t index = bit_position / 32;
+ size_t shift = bit_position % 32;
+
+ values[index] |= value << shift;
+ if (32 - shift < bit) values[index + 1] |= value >> (32 - shift);
+ }
+
+ while (length > 1 && values[length - 1] == 0) length--;
+ *integer = (pm_integer_t) { .value = 0, .length = length, .values = values, .negative = false };
+ pm_integer_normalize(integer);
+}
+
+/**
+ * Convert decimal digits to pm_integer_t.
+ */
+static void
+pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t digits_length) {
+ const size_t batch = 9;
+ size_t length = (digits_length + batch - 1) / batch;
+
+ uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t));
+ uint32_t value = 0;
+
+ for (size_t digits_index = 0; digits_index < digits_length; digits_index++) {
+ value = value * 10 + digits[digits_index];
+
+ size_t reverse_index = digits_length - digits_index - 1;
+ if (reverse_index % batch == 0) {
+ values[reverse_index / batch] = value;
+ value = 0;
+ }
+ }
+
+ // Convert base from 10**9 to 1<<32.
+ pm_integer_convert_base(integer, &((pm_integer_t) { .value = 0, .length = length, .values = values, .negative = false }), 1000000000, ((uint64_t) 1 << 32));
+ xfree(values);
+}
+
+/**
+ * Parse a large integer from a string that does not fit into uint32_t.
+ */
+static void
+pm_integer_parse_big(pm_integer_t *integer, uint32_t multiplier, const uint8_t *start, const uint8_t *end) {
+ // Allocate an array to store digits.
+ uint8_t *digits = xmalloc(sizeof(uint8_t) * (size_t) (end - start));
+ size_t digits_length = 0;
+
+ for (; start < end; start++) {
+ if (*start == '_') continue;
+ digits[digits_length++] = pm_integer_parse_digit(*start);
+ }
+
+ // Construct pm_integer_t from the digits.
+ if (multiplier == 10) {
+ pm_integer_parse_decimal(integer, digits, digits_length);
+ } else {
+ pm_integer_parse_powof2(integer, multiplier, digits, digits_length);
+ }
+
+ xfree(digits);
+}
+
+/**
+ * Parse an integer from a string. This assumes that the format of the integer
+ * has already been validated, as internal validation checks are not performed
+ * here.
+ */
+void
+pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end) {
+ // Ignore unary +. Unary - is parsed differently and will not end up here.
+ // Instead, it will modify the parsed integer later.
+ if (*start == '+') start++;
+
+ // Determine the multiplier from the base, and skip past any prefixes.
+ uint32_t multiplier = 10;
+ switch (base) {
+ case PM_INTEGER_BASE_DEFAULT:
+ while (*start == '0') start++; // 01 -> 1
+ break;
+ case PM_INTEGER_BASE_BINARY:
+ start += 2; // 0b
+ multiplier = 2;
+ break;
+ case PM_INTEGER_BASE_OCTAL:
+ start++; // 0
+ if (*start == '_' || *start == 'o' || *start == 'O') start++; // o
+ multiplier = 8;
+ break;
+ case PM_INTEGER_BASE_DECIMAL:
+ if (*start == '0' && (end - start) > 1) start += 2; // 0d
+ break;
+ case PM_INTEGER_BASE_HEXADECIMAL:
+ start += 2; // 0x
+ multiplier = 16;
+ break;
+ case PM_INTEGER_BASE_UNKNOWN:
+ if (*start == '0' && (end - start) > 1) {
+ switch (start[1]) {
+ case '_': start += 2; multiplier = 8; break;
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': start++; multiplier = 8; break;
+ case 'b': case 'B': start += 2; multiplier = 2; break;
+ case 'o': case 'O': start += 2; multiplier = 8; break;
+ case 'd': case 'D': start += 2; break;
+ case 'x': case 'X': start += 2; multiplier = 16; break;
+ default: assert(false && "unreachable"); break;
+ }
+ }
+ break;
+ }
+
+ // It's possible that we've consumed everything at this point if there is an
+ // invalid integer. If this is the case, we'll just return 0.
+ if (start >= end) return;
+
+ const uint8_t *cursor = start;
+ uint64_t value = (uint64_t) pm_integer_parse_digit(*cursor++);
+
+ for (; cursor < end; cursor++) {
+ if (*cursor == '_') continue;
+ value = value * multiplier + (uint64_t) pm_integer_parse_digit(*cursor);
+
+ if (value > UINT32_MAX) {
+ // If the integer is too large to fit into a single uint32_t, then
+ // we'll parse it as a big integer.
+ pm_integer_parse_big(integer, multiplier, start, end);
+ return;
+ }
+ }
+
+ integer->value = (uint32_t) value;
+}
+
+/**
+ * Compare two integers. This function returns -1 if the left integer is less
+ * than the right integer, 0 if they are equal, and 1 if the left integer is
+ * greater than the right integer.
+ */
+int
+pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right) {
+ if (left->negative != right->negative) return left->negative ? -1 : 1;
+ int negative = left->negative ? -1 : 1;
+
+ if (left->values == NULL && right->values == NULL) {
+ if (left->value < right->value) return -1 * negative;
+ if (left->value > right->value) return 1 * negative;
+ return 0;
+ }
+
+ if (left->values == NULL || left->length < right->length) return -1 * negative;
+ if (right->values == NULL || left->length > right->length) return 1 * negative;
+
+ for (size_t index = 0; index < left->length; index++) {
+ size_t value_index = left->length - index - 1;
+ uint32_t left_value = left->values[value_index];
+ uint32_t right_value = right->values[value_index];
+
+ if (left_value < right_value) return -1 * negative;
+ if (left_value > right_value) return 1 * negative;
+ }
+
+ return 0;
+}
+
+/**
+ * Reduce a ratio of integers to its simplest form.
+ */
+void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator) {
+ // If either the numerator or denominator do not fit into a 32-bit integer,
+ // then this function is a no-op. In the future, we may consider reducing
+ // even the larger numbers, but for now we're going to keep it simple.
+ if (
+ // If the numerator doesn't fit into a 32-bit integer, return early.
+ numerator->length != 0 ||
+ // If the denominator doesn't fit into a 32-bit integer, return early.
+ denominator->length != 0 ||
+ // If the numerator is 0, then return early.
+ numerator->value == 0 ||
+ // If the denominator is 1, then return early.
+ denominator->value == 1
+ ) return;
+
+ // Find the greatest common divisor of the numerator and denominator.
+ uint32_t divisor = numerator->value;
+ uint32_t remainder = denominator->value;
+
+ while (remainder != 0) {
+ uint32_t temporary = remainder;
+ remainder = divisor % remainder;
+ divisor = temporary;
+ }
+
+ // Divide the numerator and denominator by the greatest common divisor.
+ numerator->value /= divisor;
+ denominator->value /= divisor;
+}
+
+/**
+ * Convert an integer to a decimal string.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) {
+ if (integer->negative) {
+ pm_buffer_append_byte(buffer, '-');
+ }
+
+ // If the integer fits into a single uint32_t, then we can just append the
+ // value directly to the buffer.
+ if (integer->values == NULL) {
+ pm_buffer_append_format(buffer, "%" PRIu32, integer->value);
+ return;
+ }
+
+ // If the integer is two uint32_t values, then we can | them together and
+ // append the result to the buffer.
+ if (integer->length == 2) {
+ const uint64_t value = ((uint64_t) integer->values[0]) | ((uint64_t) integer->values[1] << 32);
+ pm_buffer_append_format(buffer, "%" PRIu64, value);
+ return;
+ }
+
+ // Otherwise, first we'll convert the base from 1<<32 to 10**9.
+ pm_integer_t converted = { 0 };
+ pm_integer_convert_base(&converted, integer, (uint64_t) 1 << 32, 1000000000);
+
+ if (converted.values == NULL) {
+ pm_buffer_append_format(buffer, "%" PRIu32, converted.value);
+ pm_integer_free(&converted);
+ return;
+ }
+
+ // Allocate a buffer that we'll copy the decimal digits into.
+ size_t digits_length = converted.length * 9;
+ char *digits = xcalloc(digits_length, sizeof(char));
+ if (digits == NULL) return;
+
+ // Pack bigdecimal to digits.
+ for (size_t value_index = 0; value_index < converted.length; value_index++) {
+ uint32_t value = converted.values[value_index];
+
+ for (size_t digit_index = 0; digit_index < 9; digit_index++) {
+ digits[digits_length - 9 * value_index - digit_index - 1] = (char) ('0' + value % 10);
+ value /= 10;
+ }
+ }
+
+ size_t start_offset = 0;
+ while (start_offset < digits_length - 1 && digits[start_offset] == '0') start_offset++;
+
+ // Finally, append the string to the buffer and free the digits.
+ pm_buffer_append_string(buffer, digits + start_offset, digits_length - start_offset);
+ xfree(digits);
+ pm_integer_free(&converted);
+}
+
+/**
+ * Free the internal memory of an integer. This memory will only be allocated if
+ * the integer exceeds the size of a single uint32_t.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_integer_free(pm_integer_t *integer) {
+ if (integer->values) {
+ xfree(integer->values);
+ }
+}
diff --git a/prism/util/pm_integer.h b/prism/util/pm_integer.h
new file mode 100644
index 0000000000..91b28ad2f3
--- /dev/null
+++ b/prism/util/pm_integer.h
@@ -0,0 +1,126 @@
+/**
+ * @file pm_integer.h
+ *
+ * This module provides functions for working with arbitrary-sized integers.
+ */
+#ifndef PRISM_NUMBER_H
+#define PRISM_NUMBER_H
+
+#include "prism/defines.h"
+#include "prism/util/pm_buffer.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/**
+ * A structure represents an arbitrary-sized integer.
+ */
+typedef struct {
+ /**
+ * Embedded value for small integer. This value is set to 0 if the value
+ * does not fit into uint32_t.
+ */
+ uint32_t value;
+
+ /**
+ * The number of allocated values. length is set to 0 if the integer fits
+ * into uint32_t.
+ */
+ size_t length;
+
+ /**
+ * List of 32-bit integers. Set to NULL if the integer fits into uint32_t.
+ */
+ uint32_t *values;
+
+ /**
+ * Whether or not the integer is negative. It is stored this way so that a
+ * zeroed pm_integer_t is always positive zero.
+ */
+ bool negative;
+} pm_integer_t;
+
+/**
+ * An enum controlling the base of an integer. It is expected that the base is
+ * already known before parsing the integer, even though it could be derived
+ * from the string itself.
+ */
+typedef enum {
+ /** The default decimal base, with no prefix. Leading 0s will be ignored. */
+ PM_INTEGER_BASE_DEFAULT,
+
+ /** The binary base, indicated by a 0b or 0B prefix. */
+ PM_INTEGER_BASE_BINARY,
+
+ /** The octal base, indicated by a 0, 0o, or 0O prefix. */
+ PM_INTEGER_BASE_OCTAL,
+
+ /** The decimal base, indicated by a 0d, 0D, or empty prefix. */
+ PM_INTEGER_BASE_DECIMAL,
+
+ /** The hexadecimal base, indicated by a 0x or 0X prefix. */
+ PM_INTEGER_BASE_HEXADECIMAL,
+
+ /**
+ * An unknown base, in which case pm_integer_parse will derive it based on
+ * the content of the string. This is less efficient and does more
+ * comparisons, so if callers know the base ahead of time, they should use
+ * that instead.
+ */
+ PM_INTEGER_BASE_UNKNOWN
+} pm_integer_base_t;
+
+/**
+ * Parse an integer from a string. This assumes that the format of the integer
+ * has already been validated, as internal validation checks are not performed
+ * here.
+ *
+ * @param integer The integer to parse into.
+ * @param base The base of the integer.
+ * @param start The start of the string.
+ * @param end The end of the string.
+ */
+void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end);
+
+/**
+ * Compare two integers. This function returns -1 if the left integer is less
+ * than the right integer, 0 if they are equal, and 1 if the left integer is
+ * greater than the right integer.
+ *
+ * @param left The left integer to compare.
+ * @param right The right integer to compare.
+ * @return The result of the comparison.
+ */
+int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right);
+
+/**
+ * Reduce a ratio of integers to its simplest form.
+ *
+ * If either the numerator or denominator do not fit into a 32-bit integer, then
+ * this function is a no-op. In the future, we may consider reducing even the
+ * larger numbers, but for now we're going to keep it simple.
+ *
+ * @param numerator The numerator of the ratio.
+ * @param denominator The denominator of the ratio.
+ */
+void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator);
+
+/**
+ * Convert an integer to a decimal string.
+ *
+ * @param buffer The buffer to append the string to.
+ * @param integer The integer to convert to a string.
+ */
+PRISM_EXPORTED_FUNCTION void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer);
+
+/**
+ * Free the internal memory of an integer. This memory will only be allocated if
+ * the integer exceeds the size of a single node in the linked list.
+ *
+ * @param integer The integer to free.
+ */
+PRISM_EXPORTED_FUNCTION void pm_integer_free(pm_integer_t *integer);
+
+#endif
diff --git a/prism/util/pm_list.c b/prism/util/pm_list.c
new file mode 100644
index 0000000000..ad2294cd60
--- /dev/null
+++ b/prism/util/pm_list.c
@@ -0,0 +1,49 @@
+#include "prism/util/pm_list.h"
+
+/**
+ * Returns true if the given list is empty.
+ */
+PRISM_EXPORTED_FUNCTION bool
+pm_list_empty_p(pm_list_t *list) {
+ return list->head == NULL;
+}
+
+/**
+ * Returns the size of the list.
+ */
+PRISM_EXPORTED_FUNCTION size_t
+pm_list_size(pm_list_t *list) {
+ return list->size;
+}
+
+/**
+ * Append a node to the given list.
+ */
+void
+pm_list_append(pm_list_t *list, pm_list_node_t *node) {
+ if (list->head == NULL) {
+ list->head = node;
+ } else {
+ list->tail->next = node;
+ }
+
+ list->tail = node;
+ list->size++;
+}
+
+/**
+ * Deallocate the internal state of the given list.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_list_free(pm_list_t *list) {
+ pm_list_node_t *node = list->head;
+ pm_list_node_t *next;
+
+ while (node != NULL) {
+ next = node->next;
+ xfree(node);
+ node = next;
+ }
+
+ list->size = 0;
+}
diff --git a/prism/util/pm_list.h b/prism/util/pm_list.h
new file mode 100644
index 0000000000..3512dee979
--- /dev/null
+++ b/prism/util/pm_list.h
@@ -0,0 +1,97 @@
+/**
+ * @file pm_list.h
+ *
+ * An abstract linked list.
+ */
+#ifndef PRISM_LIST_H
+#define PRISM_LIST_H
+
+#include "prism/defines.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+/**
+ * This struct represents an abstract linked list that provides common
+ * functionality. It is meant to be used any time a linked list is necessary to
+ * store data.
+ *
+ * The linked list itself operates off a set of pointers. Because the pointers
+ * are not necessarily sequential, they can be of any size. We use this fact to
+ * allow the consumer of this linked list to extend the node struct to include
+ * any data they want. This is done by using the pm_list_node_t as the first
+ * member of the struct.
+ *
+ * For example, if we want to store a list of integers, we can do the following:
+ *
+ * ```c
+ * typedef struct {
+ * pm_list_node_t node;
+ * int value;
+ * } pm_int_node_t;
+ *
+ * pm_list_t list = { 0 };
+ * pm_int_node_t *node = xmalloc(sizeof(pm_int_node_t));
+ * node->value = 5;
+ *
+ * pm_list_append(&list, &node->node);
+ * ```
+ *
+ * The pm_list_t struct is used to represent the overall linked list. It
+ * contains a pointer to the head and tail of the list. This allows for easy
+ * iteration and appending of new nodes.
+ */
+typedef struct pm_list_node {
+ /** A pointer to the next node in the list. */
+ struct pm_list_node *next;
+} pm_list_node_t;
+
+/**
+ * This represents the overall linked list. It keeps a pointer to the head and
+ * tail so that iteration is easy and pushing new nodes is easy.
+ */
+typedef struct {
+ /** The size of the list. */
+ size_t size;
+
+ /** A pointer to the head of the list. */
+ pm_list_node_t *head;
+
+ /** A pointer to the tail of the list. */
+ pm_list_node_t *tail;
+} pm_list_t;
+
+/**
+ * Returns true if the given list is empty.
+ *
+ * @param list The list to check.
+ * @return True if the given list is empty, otherwise false.
+ */
+PRISM_EXPORTED_FUNCTION bool pm_list_empty_p(pm_list_t *list);
+
+/**
+ * Returns the size of the list.
+ *
+ * @param list The list to check.
+ * @return The size of the list.
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_list_size(pm_list_t *list);
+
+/**
+ * Append a node to the given list.
+ *
+ * @param list The list to append to.
+ * @param node The node to append.
+ */
+void pm_list_append(pm_list_t *list, pm_list_node_t *node);
+
+/**
+ * Deallocate the internal state of the given list.
+ *
+ * @param list The list to free.
+ */
+PRISM_EXPORTED_FUNCTION void pm_list_free(pm_list_t *list);
+
+#endif
diff --git a/prism/util/pm_memchr.c b/prism/util/pm_memchr.c
new file mode 100644
index 0000000000..7ea20ace6d
--- /dev/null
+++ b/prism/util/pm_memchr.c
@@ -0,0 +1,35 @@
+#include "prism/util/pm_memchr.h"
+
+#define PRISM_MEMCHR_TRAILING_BYTE_MINIMUM 0x40
+
+/**
+ * We need to roll our own memchr to handle cases where the encoding changes and
+ * we need to search for a character in a buffer that could be the trailing byte
+ * of a multibyte character.
+ */
+void *
+pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) {
+ if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) {
+ const uint8_t *source = (const uint8_t *) memory;
+ size_t index = 0;
+
+ while (index < number) {
+ if (source[index] == character) {
+ return (void *) (source + index);
+ }
+
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (number - index));
+ if (width == 0) {
+ return NULL;
+ }
+
+ index += width;
+ }
+
+ return NULL;
+ } else {
+ return memchr(memory, character, number);
+ }
+}
+
+#undef PRISM_MEMCHR_TRAILING_BYTE_MINIMUM
diff --git a/prism/util/pm_memchr.h b/prism/util/pm_memchr.h
new file mode 100644
index 0000000000..e0671eaed3
--- /dev/null
+++ b/prism/util/pm_memchr.h
@@ -0,0 +1,29 @@
+/**
+ * @file pm_memchr.h
+ *
+ * A custom memchr implementation.
+ */
+#ifndef PRISM_MEMCHR_H
+#define PRISM_MEMCHR_H
+
+#include "prism/defines.h"
+#include "prism/encoding.h"
+
+#include <stddef.h>
+
+/**
+ * We need to roll our own memchr to handle cases where the encoding changes and
+ * we need to search for a character in a buffer that could be the trailing byte
+ * of a multibyte character.
+ *
+ * @param source The source string.
+ * @param character The character to search for.
+ * @param number The maximum number of bytes to search.
+ * @param encoding_changed Whether the encoding changed.
+ * @param encoding A pointer to the encoding.
+ * @return A pointer to the first occurrence of the character in the source
+ * string, or NULL if no such character exists.
+ */
+void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding);
+
+#endif
diff --git a/prism/util/pm_newline_list.c b/prism/util/pm_newline_list.c
new file mode 100644
index 0000000000..ce07ce8c8e
--- /dev/null
+++ b/prism/util/pm_newline_list.c
@@ -0,0 +1,96 @@
+#include "prism/util/pm_newline_list.h"
+
+/**
+ * Initialize a new newline list with the given capacity. Returns true if the
+ * allocation of the offsets succeeds, otherwise returns false.
+ */
+bool
+pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) {
+ list->offsets = (size_t *) xcalloc(capacity, sizeof(size_t));
+ if (list->offsets == NULL) return false;
+
+ list->start = start;
+
+ // This is 1 instead of 0 because we want to include the first line of the
+ // file as having offset 0, which is set because of calloc.
+ list->size = 1;
+ list->capacity = capacity;
+
+ return true;
+}
+
+/**
+ * Clear out the newlines that have been appended to the list.
+ */
+void
+pm_newline_list_clear(pm_newline_list_t *list) {
+ list->size = 1;
+}
+
+/**
+ * Append a new offset to the newline list. Returns true if the reallocation of
+ * the offsets succeeds (if one was necessary), otherwise returns false.
+ */
+bool
+pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) {
+ if (list->size == list->capacity) {
+ size_t *original_offsets = list->offsets;
+
+ list->capacity = (list->capacity * 3) / 2;
+ list->offsets = (size_t *) xcalloc(list->capacity, sizeof(size_t));
+ if (list->offsets == NULL) return false;
+
+ memcpy(list->offsets, original_offsets, list->size * sizeof(size_t));
+ xfree(original_offsets);
+ }
+
+ assert(*cursor == '\n');
+ assert(cursor >= list->start);
+ size_t newline_offset = (size_t) (cursor - list->start + 1);
+
+ assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]);
+ list->offsets[list->size++] = newline_offset;
+
+ return true;
+}
+
+/**
+ * Returns the line and column of the given offset. If the offset is not in the
+ * list, the line and column of the closest offset less than the given offset
+ * are returned.
+ */
+pm_line_column_t
+pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) {
+ assert(cursor >= list->start);
+ size_t offset = (size_t) (cursor - list->start);
+
+ size_t left = 0;
+ size_t right = list->size - 1;
+
+ while (left <= right) {
+ size_t mid = left + (right - left) / 2;
+
+ if (list->offsets[mid] == offset) {
+ return ((pm_line_column_t) { ((int32_t) mid) + start_line, 0 });
+ }
+
+ if (list->offsets[mid] < offset) {
+ left = mid + 1;
+ } else {
+ right = mid - 1;
+ }
+ }
+
+ return ((pm_line_column_t) {
+ .line = ((int32_t) left) + start_line - 1,
+ .column = (uint32_t) (offset - list->offsets[left - 1])
+ });
+}
+
+/**
+ * Free the internal memory allocated for the newline list.
+ */
+void
+pm_newline_list_free(pm_newline_list_t *list) {
+ xfree(list->offsets);
+}
diff --git a/prism/util/pm_newline_list.h b/prism/util/pm_newline_list.h
new file mode 100644
index 0000000000..7ae9b6b3da
--- /dev/null
+++ b/prism/util/pm_newline_list.h
@@ -0,0 +1,102 @@
+/**
+ * @file pm_newline_list.h
+ *
+ * A list of byte offsets of newlines in a string.
+ *
+ * When compiling the syntax tree, it's necessary to know the line and column
+ * of many nodes. This is necessary to support things like error messages,
+ * tracepoints, etc.
+ *
+ * It's possible that we could store the start line, start column, end line, and
+ * end column on every node in addition to the offsets that we already store,
+ * but that would be quite a lot of memory overhead.
+ */
+#ifndef PRISM_NEWLINE_LIST_H
+#define PRISM_NEWLINE_LIST_H
+
+#include "prism/defines.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+/**
+ * A list of offsets of newlines in a string. The offsets are assumed to be
+ * sorted/inserted in ascending order.
+ */
+typedef struct {
+ /** A pointer to the start of the source string. */
+ const uint8_t *start;
+
+ /** The number of offsets in the list. */
+ size_t size;
+
+ /** The capacity of the list that has been allocated. */
+ size_t capacity;
+
+ /** The list of offsets. */
+ size_t *offsets;
+} pm_newline_list_t;
+
+/**
+ * A line and column in a string.
+ */
+typedef struct {
+ /** The line number. */
+ int32_t line;
+
+ /** The column number. */
+ uint32_t column;
+} pm_line_column_t;
+
+/**
+ * Initialize a new newline list with the given capacity. Returns true if the
+ * allocation of the offsets succeeds, otherwise returns false.
+ *
+ * @param list The list to initialize.
+ * @param start A pointer to the start of the source string.
+ * @param capacity The initial capacity of the list.
+ * @return True if the allocation of the offsets succeeds, otherwise false.
+ */
+bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity);
+
+/**
+ * Clear out the newlines that have been appended to the list.
+ *
+ * @param list The list to clear.
+ */
+void
+pm_newline_list_clear(pm_newline_list_t *list);
+
+/**
+ * Append a new offset to the newline list. Returns true if the reallocation of
+ * the offsets succeeds (if one was necessary), otherwise returns false.
+ *
+ * @param list The list to append to.
+ * @param cursor A pointer to the offset to append.
+ * @return True if the reallocation of the offsets succeeds (if one was
+ * necessary), otherwise false.
+ */
+bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor);
+
+/**
+ * Returns the line and column of the given offset. If the offset is not in the
+ * list, the line and column of the closest offset less than the given offset
+ * are returned.
+ *
+ * @param list The list to search.
+ * @param cursor A pointer to the offset to search for.
+ * @param start_line The line to start counting from.
+ * @return The line and column of the given offset.
+ */
+pm_line_column_t pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line);
+
+/**
+ * Free the internal memory allocated for the newline list.
+ *
+ * @param list The list to free.
+ */
+void pm_newline_list_free(pm_newline_list_t *list);
+
+#endif
diff --git a/prism/util/pm_string.c b/prism/util/pm_string.c
new file mode 100644
index 0000000000..dfc121b6a2
--- /dev/null
+++ b/prism/util/pm_string.c
@@ -0,0 +1,320 @@
+#include "prism/util/pm_string.h"
+
+/**
+ * Returns the size of the pm_string_t struct. This is necessary to allocate the
+ * correct amount of memory in the FFI backend.
+ */
+PRISM_EXPORTED_FUNCTION size_t
+pm_string_sizeof(void) {
+ return sizeof(pm_string_t);
+}
+
+/**
+ * Initialize a shared string that is based on initial input.
+ */
+void
+pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end) {
+ assert(start <= end);
+
+ *string = (pm_string_t) {
+ .type = PM_STRING_SHARED,
+ .source = start,
+ .length = (size_t) (end - start)
+ };
+}
+
+/**
+ * Initialize an owned string that is responsible for freeing allocated memory.
+ */
+void
+pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) {
+ *string = (pm_string_t) {
+ .type = PM_STRING_OWNED,
+ .source = source,
+ .length = length
+ };
+}
+
+/**
+ * Initialize a constant string that doesn't own its memory source.
+ */
+void
+pm_string_constant_init(pm_string_t *string, const char *source, size_t length) {
+ *string = (pm_string_t) {
+ .type = PM_STRING_CONSTANT,
+ .source = (const uint8_t *) source,
+ .length = length
+ };
+}
+
+/**
+ * Read the file indicated by the filepath parameter into source and load its
+ * contents and size into the given `pm_string_t`. The given `pm_string_t`
+ * should be freed using `pm_string_free` when it is no longer used.
+ *
+ * We want to use demand paging as much as possible in order to avoid having to
+ * read the entire file into memory (which could be detrimental to performance
+ * for large files). This means that if we're on windows we'll use
+ * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
+ * `mmap`, and on other POSIX systems we'll use `read`.
+ */
+PRISM_EXPORTED_FUNCTION bool
+pm_string_mapped_init(pm_string_t *string, const char *filepath) {
+#ifdef _WIN32
+ // Open the file for reading.
+ HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+ return false;
+ }
+
+ // Get the file size.
+ DWORD file_size = GetFileSize(file, NULL);
+ if (file_size == INVALID_FILE_SIZE) {
+ CloseHandle(file);
+ return false;
+ }
+
+ // If the file is empty, then we don't need to do anything else, we'll set
+ // the source to a constant empty string and return.
+ if (file_size == 0) {
+ CloseHandle(file);
+ const uint8_t source[] = "";
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
+ return true;
+ }
+
+ // Create a mapping of the file.
+ HANDLE mapping = CreateFileMapping(file, NULL, PAGE_READONLY, 0, 0, NULL);
+ if (mapping == NULL) {
+ CloseHandle(file);
+ return false;
+ }
+
+ // Map the file into memory.
+ uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0);
+ CloseHandle(mapping);
+ CloseHandle(file);
+
+ if (source == NULL) {
+ return false;
+ }
+
+ *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = (size_t) file_size };
+ return true;
+#elif defined(_POSIX_MAPPED_FILES)
+ // Open the file for reading
+ int fd = open(filepath, O_RDONLY);
+ if (fd == -1) {
+ return false;
+ }
+
+ // Stat the file to get the file size
+ struct stat sb;
+ if (fstat(fd, &sb) == -1) {
+ close(fd);
+ return false;
+ }
+
+ // mmap the file descriptor to virtually get the contents
+ size_t size = (size_t) sb.st_size;
+ uint8_t *source = NULL;
+
+ if (size == 0) {
+ close(fd);
+ const uint8_t source[] = "";
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
+ return true;
+ }
+
+ source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
+ if (source == MAP_FAILED) {
+ return false;
+ }
+
+ close(fd);
+ *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = size };
+ return true;
+#else
+ (void) string;
+ (void) filepath;
+ perror("pm_string_mapped_init is not implemented for this platform");
+ return false;
+#endif
+}
+
+/**
+ * Read the file indicated by the filepath parameter into source and load its
+ * contents and size into the given `pm_string_t`. The given `pm_string_t`
+ * should be freed using `pm_string_free` when it is no longer used.
+ */
+PRISM_EXPORTED_FUNCTION bool
+pm_string_file_init(pm_string_t *string, const char *filepath) {
+#ifdef _WIN32
+ // Open the file for reading.
+ HANDLE file = CreateFile(filepath, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
+
+ if (file == INVALID_HANDLE_VALUE) {
+ return false;
+ }
+
+ // Get the file size.
+ DWORD file_size = GetFileSize(file, NULL);
+ if (file_size == INVALID_FILE_SIZE) {
+ CloseHandle(file);
+ return false;
+ }
+
+ // If the file is empty, then we don't need to do anything else, we'll set
+ // the source to a constant empty string and return.
+ if (file_size == 0) {
+ CloseHandle(file);
+ const uint8_t source[] = "";
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
+ return true;
+ }
+
+ // Create a buffer to read the file into.
+ uint8_t *source = xmalloc(file_size);
+ if (source == NULL) {
+ CloseHandle(file);
+ return false;
+ }
+
+ // Read the contents of the file
+ DWORD bytes_read;
+ if (!ReadFile(file, source, file_size, &bytes_read, NULL)) {
+ CloseHandle(file);
+ return false;
+ }
+
+ // Check the number of bytes read
+ if (bytes_read != file_size) {
+ xfree(source);
+ CloseHandle(file);
+ return false;
+ }
+
+ CloseHandle(file);
+ *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size };
+ return true;
+#elif defined(_POSIX_MAPPED_FILES)
+ FILE *file = fopen(filepath, "rb");
+ if (file == NULL) {
+ return false;
+ }
+
+ fseek(file, 0, SEEK_END);
+ long file_size = ftell(file);
+
+ if (file_size == -1) {
+ fclose(file);
+ return false;
+ }
+
+ if (file_size == 0) {
+ fclose(file);
+ const uint8_t source[] = "";
+ *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 };
+ return true;
+ }
+
+ size_t length = (size_t) file_size;
+ uint8_t *source = xmalloc(length);
+ if (source == NULL) {
+ fclose(file);
+ return false;
+ }
+
+ fseek(file, 0, SEEK_SET);
+ size_t bytes_read = fread(source, length, 1, file);
+ fclose(file);
+
+ if (bytes_read != 1) {
+ xfree(source);
+ return false;
+ }
+
+ *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = length };
+ return true;
+#else
+ (void) string;
+ (void) filepath;
+ perror("pm_string_file_init is not implemented for this platform");
+ return false;
+#endif
+}
+
+/**
+ * Ensure the string is owned. If it is not, then reinitialize it as owned and
+ * copy over the previous source.
+ */
+void
+pm_string_ensure_owned(pm_string_t *string) {
+ if (string->type == PM_STRING_OWNED) return;
+
+ size_t length = pm_string_length(string);
+ const uint8_t *source = pm_string_source(string);
+
+ uint8_t *memory = xmalloc(length);
+ if (!memory) return;
+
+ pm_string_owned_init(string, memory, length);
+ memcpy((void *) string->source, source, length);
+}
+
+/**
+ * Compare the underlying lengths and bytes of two strings. Returns 0 if the
+ * strings are equal, a negative number if the left string is less than the
+ * right string, and a positive number if the left string is greater than the
+ * right string.
+ */
+int
+pm_string_compare(const pm_string_t *left, const pm_string_t *right) {
+ size_t left_length = pm_string_length(left);
+ size_t right_length = pm_string_length(right);
+
+ if (left_length < right_length) {
+ return -1;
+ } else if (left_length > right_length) {
+ return 1;
+ }
+
+ return memcmp(pm_string_source(left), pm_string_source(right), left_length);
+}
+
+/**
+ * Returns the length associated with the string.
+ */
+PRISM_EXPORTED_FUNCTION size_t
+pm_string_length(const pm_string_t *string) {
+ return string->length;
+}
+
+/**
+ * Returns the start pointer associated with the string.
+ */
+PRISM_EXPORTED_FUNCTION const uint8_t *
+pm_string_source(const pm_string_t *string) {
+ return string->source;
+}
+
+/**
+ * Free the associated memory of the given string.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_string_free(pm_string_t *string) {
+ void *memory = (void *) string->source;
+
+ if (string->type == PM_STRING_OWNED) {
+ xfree(memory);
+#ifdef PRISM_HAS_MMAP
+ } else if (string->type == PM_STRING_MAPPED && string->length) {
+#if defined(_WIN32)
+ UnmapViewOfFile(memory);
+#elif defined(_POSIX_MAPPED_FILES)
+ munmap(memory, string->length);
+#endif
+#endif /* PRISM_HAS_MMAP */
+ }
+}
diff --git a/prism/util/pm_string.h b/prism/util/pm_string.h
new file mode 100644
index 0000000000..d23792c0ba
--- /dev/null
+++ b/prism/util/pm_string.h
@@ -0,0 +1,166 @@
+/**
+ * @file pm_string.h
+ *
+ * A generic string type that can have various ownership semantics.
+ */
+#ifndef PRISM_STRING_H
+#define PRISM_STRING_H
+
+#include "prism/defines.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+
+// The following headers are necessary to read files using demand paging.
+#ifdef _WIN32
+#include <windows.h>
+#elif defined(_POSIX_MAPPED_FILES)
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#endif
+
+/**
+ * A generic string type that can have various ownership semantics.
+ */
+typedef struct {
+ /** A pointer to the start of the string. */
+ const uint8_t *source;
+
+ /** The length of the string in bytes of memory. */
+ size_t length;
+
+ /** The type of the string. This field determines how the string should be freed. */
+ enum {
+ /** This string is a constant string, and should not be freed. */
+ PM_STRING_CONSTANT,
+
+ /** This is a slice of another string, and should not be freed. */
+ PM_STRING_SHARED,
+
+ /** This string owns its memory, and should be freed using `pm_string_free`. */
+ PM_STRING_OWNED,
+
+#ifdef PRISM_HAS_MMAP
+ /** This string is a memory-mapped file, and should be freed using `pm_string_free`. */
+ PM_STRING_MAPPED
+#endif
+ } type;
+} pm_string_t;
+
+/**
+ * Returns the size of the pm_string_t struct. This is necessary to allocate the
+ * correct amount of memory in the FFI backend.
+ *
+ * @return The size of the pm_string_t struct.
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_string_sizeof(void);
+
+/**
+ * Defines an empty string. This is useful for initializing a string that will
+ * be filled in later.
+ */
+#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 })
+
+/**
+ * Initialize a shared string that is based on initial input.
+ *
+ * @param string The string to initialize.
+ * @param start The start of the string.
+ * @param end The end of the string.
+ */
+void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end);
+
+/**
+ * Initialize an owned string that is responsible for freeing allocated memory.
+ *
+ * @param string The string to initialize.
+ * @param source The source of the string.
+ * @param length The length of the string.
+ */
+void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length);
+
+/**
+ * Initialize a constant string that doesn't own its memory source.
+ *
+ * @param string The string to initialize.
+ * @param source The source of the string.
+ * @param length The length of the string.
+ */
+void pm_string_constant_init(pm_string_t *string, const char *source, size_t length);
+
+/**
+ * Read the file indicated by the filepath parameter into source and load its
+ * contents and size into the given `pm_string_t`. The given `pm_string_t`
+ * should be freed using `pm_string_free` when it is no longer used.
+ *
+ * We want to use demand paging as much as possible in order to avoid having to
+ * read the entire file into memory (which could be detrimental to performance
+ * for large files). This means that if we're on windows we'll use
+ * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use
+ * `mmap`, and on other POSIX systems we'll use `read`.
+ *
+ * @param string The string to initialize.
+ * @param filepath The filepath to read.
+ * @return Whether or not the file was successfully mapped.
+ */
+PRISM_EXPORTED_FUNCTION bool pm_string_mapped_init(pm_string_t *string, const char *filepath);
+
+/**
+ * Read the file indicated by the filepath parameter into source and load its
+ * contents and size into the given `pm_string_t`. The given `pm_string_t`
+ * should be freed using `pm_string_free` when it is no longer used.
+ *
+ * @param string The string to initialize.
+ * @param filepath The filepath to read.
+ * @return Whether or not the file was successfully read.
+ */
+PRISM_EXPORTED_FUNCTION bool pm_string_file_init(pm_string_t *string, const char *filepath);
+
+/**
+ * Ensure the string is owned. If it is not, then reinitialize it as owned and
+ * copy over the previous source.
+ *
+ * @param string The string to ensure is owned.
+ */
+void pm_string_ensure_owned(pm_string_t *string);
+
+/**
+ * Compare the underlying lengths and bytes of two strings. Returns 0 if the
+ * strings are equal, a negative number if the left string is less than the
+ * right string, and a positive number if the left string is greater than the
+ * right string.
+ *
+ * @param left The left string to compare.
+ * @param right The right string to compare.
+ * @return The comparison result.
+ */
+int pm_string_compare(const pm_string_t *left, const pm_string_t *right);
+
+/**
+ * Returns the length associated with the string.
+ *
+ * @param string The string to get the length of.
+ * @return The length of the string.
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_string_length(const pm_string_t *string);
+
+/**
+ * Returns the start pointer associated with the string.
+ *
+ * @param string The string to get the start pointer of.
+ * @return The start pointer of the string.
+ */
+PRISM_EXPORTED_FUNCTION const uint8_t * pm_string_source(const pm_string_t *string);
+
+/**
+ * Free the associated memory of the given string.
+ *
+ * @param string The string to free.
+ */
+PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string);
+
+#endif
diff --git a/prism/util/pm_strncasecmp.c b/prism/util/pm_strncasecmp.c
new file mode 100644
index 0000000000..2240bf8110
--- /dev/null
+++ b/prism/util/pm_strncasecmp.c
@@ -0,0 +1,24 @@
+#include "prism/util/pm_strncasecmp.h"
+
+/**
+ * Compare two strings, ignoring case, up to the given length. Returns 0 if the
+ * strings are equal, a negative number if string1 is less than string2, or a
+ * positive number if string1 is greater than string2.
+ *
+ * Note that this is effectively our own implementation of strncasecmp, but it's
+ * not available on all of the platforms we want to support so we're rolling it
+ * here.
+ */
+int
+pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length) {
+ size_t offset = 0;
+ int difference = 0;
+
+ while (offset < length && string1[offset] != '\0') {
+ if (string2[offset] == '\0') return string1[offset];
+ if ((difference = tolower(string1[offset]) - tolower(string2[offset])) != 0) return difference;
+ offset++;
+ }
+
+ return difference;
+}
diff --git a/prism/util/pm_strncasecmp.h b/prism/util/pm_strncasecmp.h
new file mode 100644
index 0000000000..5cb88cb5eb
--- /dev/null
+++ b/prism/util/pm_strncasecmp.h
@@ -0,0 +1,32 @@
+/**
+ * @file pm_strncasecmp.h
+ *
+ * A custom strncasecmp implementation.
+ */
+#ifndef PRISM_STRNCASECMP_H
+#define PRISM_STRNCASECMP_H
+
+#include "prism/defines.h"
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/**
+ * Compare two strings, ignoring case, up to the given length. Returns 0 if the
+ * strings are equal, a negative number if string1 is less than string2, or a
+ * positive number if string1 is greater than string2.
+ *
+ * Note that this is effectively our own implementation of strncasecmp, but it's
+ * not available on all of the platforms we want to support so we're rolling it
+ * here.
+ *
+ * @param string1 The first string to compare.
+ * @param string2 The second string to compare
+ * @param length The maximum number of characters to compare.
+ * @return 0 if the strings are equal, a negative number if string1 is less than
+ * string2, or a positive number if string1 is greater than string2.
+ */
+int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length);
+
+#endif
diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c
new file mode 100644
index 0000000000..916a4cc3fd
--- /dev/null
+++ b/prism/util/pm_strpbrk.c
@@ -0,0 +1,206 @@
+#include "prism/util/pm_strpbrk.h"
+
+/**
+ * Add an invalid multibyte character error to the parser.
+ */
+static inline void
+pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
+ pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
+}
+
+/**
+ * Set the explicit encoding for the parser to the current encoding.
+ */
+static inline void
+pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
+ if (parser->explicit_encoding != NULL) {
+ if (parser->explicit_encoding == parser->encoding) {
+ // Okay, we already locked to this encoding.
+ } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+ // Not okay, we already found a Unicode escape sequence and this
+ // conflicts.
+ pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+ } else {
+ // Should not be anything else.
+ assert(false && "unreachable");
+ }
+ }
+
+ parser->explicit_encoding = parser->encoding;
+}
+
+/**
+ * This is the default path.
+ */
+static inline const uint8_t *
+pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+ size_t index = 0;
+
+ while (index < maximum) {
+ if (strchr((const char *) charset, source[index]) != NULL) {
+ return source + index;
+ }
+
+ if (source[index] < 0x80) {
+ index++;
+ } else {
+ size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index));
+
+ if (width > 0) {
+ index += width;
+ } else if (!validate) {
+ index++;
+ } else {
+ // At this point we know we have an invalid multibyte character.
+ // We'll walk forward as far as we can until we find the next
+ // valid character so that we don't spam the user with a ton of
+ // the same kind of error.
+ const size_t start = index;
+
+ do {
+ index++;
+ } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * This is the path when the encoding is ASCII-8BIT.
+ */
+static inline const uint8_t *
+pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+ size_t index = 0;
+
+ while (index < maximum) {
+ if (strchr((const char *) charset, source[index]) != NULL) {
+ return source + index;
+ }
+
+ if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
+ index++;
+ }
+
+ return NULL;
+}
+
+/**
+ * This is the slow path that does care about the encoding.
+ */
+static inline const uint8_t *
+pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+ size_t index = 0;
+ const pm_encoding_t *encoding = parser->encoding;
+
+ while (index < maximum) {
+ if (strchr((const char *) charset, source[index]) != NULL) {
+ return source + index;
+ }
+
+ if (source[index] < 0x80) {
+ index++;
+ } else {
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+ if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);
+
+ if (width > 0) {
+ index += width;
+ } else if (!validate) {
+ index++;
+ } else {
+ // At this point we know we have an invalid multibyte character.
+ // We'll walk forward as far as we can until we find the next
+ // valid character so that we don't spam the user with a ton of
+ // the same kind of error.
+ const size_t start = index;
+
+ do {
+ index++;
+ } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * This is the fast path that does not care about the encoding because we know
+ * the encoding only supports single-byte characters.
+ */
+static inline const uint8_t *
+pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
+ size_t index = 0;
+ const pm_encoding_t *encoding = parser->encoding;
+
+ while (index < maximum) {
+ if (strchr((const char *) charset, source[index]) != NULL) {
+ return source + index;
+ }
+
+ if (source[index] < 0x80 || !validate) {
+ index++;
+ } else {
+ size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+ pm_strpbrk_explicit_encoding_set(parser, source, width);
+
+ if (width > 0) {
+ index += width;
+ } else {
+ // At this point we know we have an invalid multibyte character.
+ // We'll walk forward as far as we can until we find the next
+ // valid character so that we don't spam the user with a ton of
+ // the same kind of error.
+ const size_t start = index;
+
+ do {
+ index++;
+ } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+
+ pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
+ }
+ }
+ }
+
+ return NULL;
+}
+
+/**
+ * Here we have rolled our own version of strpbrk. The standard library strpbrk
+ * has undefined behavior when the source string is not null-terminated. We want
+ * to support strings that are not null-terminated because pm_parse does not
+ * have the contract that the string is null-terminated. (This is desirable
+ * because it means the extension can call pm_parse with the result of a call to
+ * mmap).
+ *
+ * The standard library strpbrk also does not support passing a maximum length
+ * to search. We want to support this for the reason mentioned above, but we
+ * also don't want it to stop on null bytes. Ruby actually allows null bytes
+ * within strings, comments, regular expressions, etc. So we need to be able to
+ * skip past them.
+ *
+ * Finally, we want to support encodings wherein the charset could contain
+ * characters that are trailing bytes of multi-byte characters. For example, in
+ * Shift_JIS, the backslash character can be a trailing byte. In that case we
+ * need to take a slower path and iterate one multi-byte character at a time.
+ */
+const uint8_t *
+pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) {
+ if (length <= 0) {
+ return NULL;
+ } else if (!parser->encoding_changed) {
+ return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
+ } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
+ return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
+ } else if (parser->encoding->multibyte) {
+ return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
+ } else {
+ return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate);
+ }
+}
diff --git a/prism/util/pm_strpbrk.h b/prism/util/pm_strpbrk.h
new file mode 100644
index 0000000000..f387bd5782
--- /dev/null
+++ b/prism/util/pm_strpbrk.h
@@ -0,0 +1,46 @@
+/**
+ * @file pm_strpbrk.h
+ *
+ * A custom strpbrk implementation.
+ */
+#ifndef PRISM_STRPBRK_H
+#define PRISM_STRPBRK_H
+
+#include "prism/defines.h"
+#include "prism/diagnostic.h"
+#include "prism/parser.h"
+
+#include <stddef.h>
+#include <string.h>
+
+/**
+ * Here we have rolled our own version of strpbrk. The standard library strpbrk
+ * has undefined behavior when the source string is not null-terminated. We want
+ * to support strings that are not null-terminated because pm_parse does not
+ * have the contract that the string is null-terminated. (This is desirable
+ * because it means the extension can call pm_parse with the result of a call to
+ * mmap).
+ *
+ * The standard library strpbrk also does not support passing a maximum length
+ * to search. We want to support this for the reason mentioned above, but we
+ * also don't want it to stop on null bytes. Ruby actually allows null bytes
+ * within strings, comments, regular expressions, etc. So we need to be able to
+ * skip past them.
+ *
+ * Finally, we want to support encodings wherein the charset could contain
+ * characters that are trailing bytes of multi-byte characters. For example, in
+ * Shift-JIS, the backslash character can be a trailing byte. In that case we
+ * need to take a slower path and iterate one multi-byte character at a time.
+ *
+ * @param parser The parser.
+ * @param source The source to search.
+ * @param charset The charset to search for.
+ * @param length The maximum number of bytes to search.
+ * @param validate Whether to validate that the source string is valid in the
+ * current encoding of the parser.
+ * @return A pointer to the first character in the source string that is in the
+ * charset, or NULL if no such character exists.
+ */
+const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
+
+#endif