26 files changed, 2672 insertions, 0 deletions
diff --git a/prism/internal/allocator.h b/prism/internal/allocator.h
new file mode 100644
index 0000000000..6c54010dbf
--- /dev/null
+++ b/prism/internal/allocator.h
@@ -0,0 +1,68 @@
+#ifndef PRISM_INTERNAL_ALLOCATOR_H
+#define PRISM_INTERNAL_ALLOCATOR_H
+
+/* If you build Prism with a custom allocator, configure it with
+ * "-D PRISM_XALLOCATOR" to use your own allocator that defines xmalloc,
+ * xrealloc, xcalloc, and xfree.
+ *
+ * For example, your `prism_xallocator.h` file could look like this:
+ *
+ * ```
+ * #ifndef PRISM_XALLOCATOR_H
+ * #define PRISM_XALLOCATOR_H
+ * #define xmalloc          my_malloc
+ * #define xrealloc         my_realloc
+ * #define xcalloc          my_calloc
+ * #define xfree            my_free
+ * #define xrealloc_sized   my_realloc_sized // (optional)
+ * #define xfree_sized      my_free_sized    // (optional)
+ * #endif
+ * ```
+ */
+#ifdef PRISM_XALLOCATOR
+    #include "prism_xallocator.h"
+#else
+    #ifndef xmalloc
+        /* The malloc function that should be used. This can be overridden with
+         * the PRISM_XALLOCATOR define. */
+        #define xmalloc malloc
+    #endif
+
+    #ifndef xrealloc
+        /* The realloc function that should be used. This can be overridden with
+         * the PRISM_XALLOCATOR define. */
+        #define xrealloc realloc
+    #endif
+
+    #ifndef xcalloc
+        /* The calloc function that should be used. This can be overridden with
+         * the PRISM_XALLOCATOR define. */
+        #define xcalloc calloc
+    #endif
+
+    #ifndef xfree
+        /* The free function that should be used. This can be overridden with
+         * the PRISM_XALLOCATOR define. */
+        #define xfree free
+    #endif
+#endif
+
+#ifndef xfree_sized
+    /* The free_sized function that should be used. This can be overridden with
+     * the PRISM_XALLOCATOR define. If not defined, defaults to calling xfree.
+     */
+    #define xfree_sized(p, s) xfree(((void)(s), (p)))
+#endif
+
+#ifndef xrealloc_sized
+    /* The xrealloc_sized function that should be used. This can be overridden
+     * with the PRISM_XALLOCATOR define. If not defined, defaults to calling
+     * xrealloc. */
+    #define xrealloc_sized(p, ns, os) xrealloc((p), ((void)(os), (ns)))
+#endif
+
+#ifdef PRISM_BUILD_DEBUG
+    #include "prism/internal/allocator_debug.h"
+#endif
+
+#endif
diff --git a/prism/internal/allocator_debug.h b/prism/internal/allocator_debug.h
new file mode 100644
index 0000000000..846e96ba2d
--- /dev/null
+++ b/prism/internal/allocator_debug.h
@@ -0,0 +1,88 @@
+#ifndef PRISM_INTERNAL_ALLOCATOR_DEBUG_H
+#define PRISM_INTERNAL_ALLOCATOR_DEBUG_H
+
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+static inline void *
+pm_allocator_debug_malloc(size_t size) {
+    size_t *memory = xmalloc(size + sizeof(size_t));
+    memory[0] = size;
+    return memory + 1;
+}
+
+static inline void *
+pm_allocator_debug_calloc(size_t nmemb, size_t size) {
+    size_t total_size = nmemb * size;
+    void *ptr = pm_allocator_debug_malloc(total_size);
+    memset(ptr, 0, total_size);
+    return ptr;
+}
+
+static inline void *
+pm_allocator_debug_realloc(void *ptr, size_t size) {
+    if (ptr == NULL) {
+        return pm_allocator_debug_malloc(size);
+    }
+
+    size_t *memory = (size_t *)ptr;
+    void *raw_memory = memory - 1;
+    memory = (size_t *)xrealloc(raw_memory, size + sizeof(size_t));
+    memory[0] = size;
+    return memory + 1;
+}
+
+static inline void
+pm_allocator_debug_free(void *ptr) {
+    if (ptr != NULL) {
+        size_t *memory = (size_t *)ptr;
+        xfree(memory - 1);
+    }
+}
+
+static inline void
+pm_allocator_debug_free_sized(void *ptr, size_t old_size) {
+    if (ptr != NULL) {
+        size_t *memory = (size_t *)ptr;
+        if (old_size != memory[-1]) {
+            fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but freed with size %lu\n", ptr, memory[-1], old_size);
+            abort();
+        }
+        xfree_sized(memory - 1, old_size + sizeof(size_t));
+    }
+}
+
+static inline void *
+pm_allocator_debug_realloc_sized(void *ptr, size_t size, size_t old_size) {
+    if (ptr == NULL) {
+        if (old_size != 0) {
+            fprintf(stderr, "[BUG] realloc_sized called with NULL pointer and old size %lu\n", old_size);
+            abort();
+        }
+        return pm_allocator_debug_malloc(size);
+    }
+
+    size_t *memory = (size_t *)ptr;
+    if (old_size != memory[-1]) {
+        fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but realloced with size %lu\n", ptr, memory[-1], old_size);
+        abort();
+    }
+    return pm_allocator_debug_realloc(ptr, size);
+}
+
+#undef xmalloc
+#undef xrealloc
+#undef xcalloc
+#undef xfree
+#undef xrealloc_sized
+#undef xfree_sized
+
+#define xmalloc          pm_allocator_debug_malloc
+#define xrealloc         pm_allocator_debug_realloc
+#define xcalloc          pm_allocator_debug_calloc
+#define xfree            pm_allocator_debug_free
+#define xrealloc_sized   pm_allocator_debug_realloc_sized
+#define xfree_sized      pm_allocator_debug_free_sized
+
+#endif
diff --git a/prism/internal/arena.h b/prism/internal/arena.h
new file mode 100644
index 0000000000..2e413b42bf
--- /dev/null
+++ b/prism/internal/arena.h
@@ -0,0 +1,108 @@
+#ifndef PRISM_INTERNAL_ARENA_H
+#define PRISM_INTERNAL_ARENA_H
+
+#include "prism/compiler/exported.h"
+#include "prism/compiler/flex_array.h"
+#include "prism/compiler/force_inline.h"
+#include "prism/compiler/inline.h"
+
+#include "prism/arena.h"
+
+#include <stddef.h>
+#include <string.h>
+
+/*
+ * A single block of memory in the arena. Blocks are linked via prev pointers so
+ * they can be freed by walking the chain.
+ */
+typedef struct pm_arena_block {
+    /* The previous block in the chain (for freeing). */
+    struct pm_arena_block *prev;
+
+    /* The total usable bytes in data[]. */
+    size_t capacity;
+
+    /* The number of bytes consumed so far. */
+    size_t used;
+
+    /* The block's data. */
+    char data[PM_FLEX_ARRAY_LENGTH];
+} pm_arena_block_t;
+
+/*
+ * A bump allocator. Allocations are made by bumping a pointer within the
+ * current block. When a block is full, a new block is allocated and linked to
+ * the previous one. All blocks are freed at once by walking the chain.
+ */
+struct pm_arena_t {
+    /* The active block (allocate from here). */
+    pm_arena_block_t *current;
+
+    /* The number of blocks allocated. */
+    size_t block_count;
+};
+
+/*
+ * Free all blocks in the arena. After this call, all pointers returned by
+ * pm_arena_alloc and pm_arena_zalloc are invalid.
+ */
+void pm_arena_cleanup(pm_arena_t *arena);
+
+/*
+ * Ensure the arena has at least `capacity` bytes available in its current
+ * block, allocating a new block if necessary. This allows callers to
+ * pre-size the arena to avoid repeated small block allocations.
+ */
+void pm_arena_reserve(pm_arena_t *arena, size_t capacity);
+
+/*
+ * Slow path for pm_arena_alloc: allocate a new block and return a pointer to
+ * the first `size` bytes. Do not call directly — use pm_arena_alloc instead.
+ */
+void * pm_arena_alloc_slow(pm_arena_t *arena, size_t size);
+
+/*
+ * Allocate memory from the arena. The returned memory is NOT zeroed. This
+ * function is infallible — it aborts on allocation failure.
+ *
+ * The fast path (bump pointer within the current block) is inlined at each
+ * call site. The slow path (new block allocation) is out-of-line.
+ */
+static PRISM_FORCE_INLINE void *
+pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment) {
+    if (arena->current != NULL) {
+        size_t used_aligned = (arena->current->used + alignment - 1) & ~(alignment - 1);
+        size_t needed = used_aligned + size;
+
+        if (used_aligned >= arena->current->used && needed >= used_aligned && needed <= arena->current->capacity) {
+            arena->current->used = needed;
+            return arena->current->data + used_aligned;
+        }
+    }
+
+    return pm_arena_alloc_slow(arena, size);
+}
+
+/*
+ * Allocate zero-initialized memory from the arena. This function is infallible
+ * — it aborts on allocation failure.
+ */
+static PRISM_INLINE void *
+pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment) {
+    void *ptr = pm_arena_alloc(arena, size, alignment);
+    memset(ptr, 0, size);
+    return ptr;
+}
+
+/*
+ * Allocate memory from the arena and copy the given data into it. This is a
+ * convenience wrapper around pm_arena_alloc + memcpy.
+ */
+static PRISM_INLINE void *
+pm_arena_memdup(pm_arena_t *arena, const void *src, size_t size, size_t alignment) {
+    void *dst = pm_arena_alloc(arena, size, alignment);
+    memcpy(dst, src, size);
+    return dst;
+}
+
+#endif
diff --git a/prism/internal/bit.h b/prism/internal/bit.h
new file mode 100644
index 0000000000..b0111a4c2c
--- /dev/null
+++ b/prism/internal/bit.h
@@ -0,0 +1,42 @@
+#ifndef PRISM_INTERNAL_BIT_H
+#define PRISM_INTERNAL_BIT_H
+
+#include "prism/compiler/inline.h"
+
+/*
+ * Count trailing zero bits in a 64-bit value. Used by SWAR identifier scanning
+ * to find the first non-matching byte in a word.
+ *
+ * Precondition: v must be nonzero. The result is undefined when v == 0
+ * (matching the behavior of __builtin_ctzll and _BitScanForward64).
+ */
+#if defined(__GNUC__) || defined(__clang__)
+#define pm_ctzll(v) ((unsigned) __builtin_ctzll(v))
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#include <stdint.h>
+
+static PRISM_INLINE unsigned
+pm_ctzll(uint64_t v) {
+    unsigned long index;
+    _BitScanForward64(&index, v);
+    return (unsigned) index;
+}
+#else
+#include <stdint.h>
+
+static PRISM_INLINE unsigned
+pm_ctzll(uint64_t v) {
+    unsigned c = 0;
+    v &= (uint64_t) (-(int64_t) v);
+    if (v & 0x00000000FFFFFFFFULL) c += 0;  else c += 32;
+    if (v & 0x0000FFFF0000FFFFULL) c += 0;  else c += 16;
+    if (v & 0x00FF00FF00FF00FFULL) c += 0;  else c += 8;
+    if (v & 0x0F0F0F0F0F0F0F0FULL) c += 0;  else c += 4;
+    if (v & 0x3333333333333333ULL) c += 0;  else c += 2;
+    if (v & 0x5555555555555555ULL) c += 0;  else c += 1;
+    return c;
+}
+#endif
+
+#endif
diff --git a/prism/internal/buffer.h b/prism/internal/buffer.h
new file mode 100644
index 0000000000..a849bbf8e6
--- /dev/null
+++ b/prism/internal/buffer.h
@@ -0,0 +1,91 @@
+#ifndef PRISM_INTERNAL_BUFFER_H
+#define PRISM_INTERNAL_BUFFER_H
+
+#include "prism/compiler/format.h"
+
+#include "prism/buffer.h"
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/*
+ * A simple memory buffer that stores data in a contiguous block of memory.
+ */
+struct pm_buffer_t {
+    /* The length of the buffer in bytes. */
+    size_t length;
+
+    /* The capacity of the buffer in bytes that has been allocated. */
+    size_t capacity;
+
+    /* A pointer to the start of the buffer. */
+    char *value;
+};
+
+/* Initialize a pm_buffer_t with the given capacity. */
+void pm_buffer_init(pm_buffer_t *buffer, size_t capacity);
+
+/* Free the memory held by the buffer. */
+void pm_buffer_cleanup(pm_buffer_t *buffer);
+
+/* Append the given amount of space as zeroes to the buffer. */
+void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length);
+
+/* Append a formatted string to the buffer. */
+void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3);
+
+/* Append a string to the buffer. */
+void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length);
+
+/* Append a list of bytes to the buffer. */
+void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length);
+
+/* Append a single byte to the buffer. */
+void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value);
+
+/* Append a 32-bit unsigned integer to the buffer as a variable-length integer. */
+void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value);
+
+/* Append a 32-bit signed integer to the buffer as a variable-length integer. */
+void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value);
+
+/* Append a double to the buffer. */
+void pm_buffer_append_double(pm_buffer_t *buffer, double value);
+
+/* Append a unicode codepoint to the buffer. */
+bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value);
+
+/*
+ * The different types of escaping that can be performed by the buffer when
+ * appending a slice of Ruby source code.
+ */
+typedef enum {
+    PM_BUFFER_ESCAPING_RUBY,
+    PM_BUFFER_ESCAPING_JSON
+} pm_buffer_escaping_t;
+
+/* Append a slice of source code to the buffer. */
+void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping);
+
+/* Prepend the given string to the buffer. */
+void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length);
+
+/* Concatenate one buffer onto another. */
+void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source);
+
+/*
+ * Clear the buffer by reducing its size to 0. This does not free the allocated
+ * memory, but it does allow the buffer to be reused.
+ */
+void pm_buffer_clear(pm_buffer_t *buffer);
+
+/* Strip the whitespace from the end of the buffer. */
+void pm_buffer_rstrip(pm_buffer_t *buffer);
+
+/* Checks if the buffer includes the given value. */
+size_t pm_buffer_index(const pm_buffer_t *buffer, char value);
+
+/* Insert the given string into the buffer at the given index. */
+void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length);
+
+#endif
diff --git a/prism/internal/char.h b/prism/internal/char.h
new file mode 100644
index 0000000000..9a58fba8c5
--- /dev/null
+++ b/prism/internal/char.h
@@ -0,0 +1,139 @@
+#ifndef PRISM_INTERNAL_CHAR_H
+#define PRISM_INTERNAL_CHAR_H
+
+#include "prism/compiler/force_inline.h"
+
+#include "prism/arena.h"
+#include "prism/line_offset_list.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/* Bit flag for whitespace characters in pm_byte_table. */
+#define PRISM_CHAR_BIT_WHITESPACE (1 << 0)
+
+/* Bit flag for inline whitespace characters in pm_byte_table. */
+#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1)
+
+/*
+ * A lookup table for classifying bytes. Each entry is a bitfield of
+ * PRISM_CHAR_BIT_* flags. Defined in char.c.
+ */
+extern const uint8_t pm_byte_table[256];
+
+/* Returns true if the given character is a whitespace character. */
+static PRISM_FORCE_INLINE bool
+pm_char_is_whitespace(const uint8_t b) {
+    return (pm_byte_table[b] & PRISM_CHAR_BIT_WHITESPACE) != 0;
+}
+
+/* Returns true if the given character is an inline whitespace character. */
+static PRISM_FORCE_INLINE bool
+pm_char_is_inline_whitespace(const uint8_t b) {
+    return (pm_byte_table[b] & PRISM_CHAR_BIT_INLINE_WHITESPACE) != 0;
+}
+
+/*
+ * Returns the number of characters at the start of the string that are inline
+ * whitespace (space/tab). Scans the byte table directly for use in hot paths.
+ */
+static PRISM_FORCE_INLINE size_t
+pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) {
+    if (length <= 0) return 0;
+    size_t size = 0;
+    size_t maximum = (size_t) length;
+    while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_INLINE_WHITESPACE)) size++;
+    return size;
+}
+
+/*
+ * Returns the number of characters at the start of the string that are
+ * whitespace. Disallows searching past the given maximum number of characters.
+ */
+size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length);
+
+/*
+ * Returns the number of characters at the start of the string that are
+ * whitespace while also tracking the location of each newline. Disallows
+ * searching past the given maximum number of characters.
+ */
+size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_arena_t *arena, pm_line_offset_list_t *line_offsets, uint32_t start_offset);
+
+/*
+ * Returns the number of characters at the start of the string that are decimal
+ * digits. Disallows searching past the given maximum number of characters.
+ */
+size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length);
+
+/*
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits. Disallows searching past the given maximum number of
+ * characters.
+ */
+size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length);
+
+/*
+ * Returns the number of characters at the start of the string that are octal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/*
+ * Returns the number of characters at the start of the string that are decimal
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/*
+ * Returns the number of characters at the start of the string that are
+ * hexadecimal digits or underscores. Disallows searching past the given maximum
+ * number of characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+/*
+ * Returns the number of characters at the start of the string that are regexp
+ * options. Disallows searching past the given maximum number of characters.
+ */
+size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length);
+
+/*
+ * Returns the number of characters at the start of the string that are binary
+ * digits or underscores. Disallows searching past the given maximum number of
+ * characters.
+ *
+ * If multiple underscores are found in a row or if an underscore is
+ * found at the end of the number, then the invalid pointer is set to the index
+ * of the first invalid underscore.
+ */
+size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid);
+
+
+/* Returns true if the given character is a binary digit. */
+bool pm_char_is_binary_digit(const uint8_t b);
+
+/* Returns true if the given character is an octal digit. */
+bool pm_char_is_octal_digit(const uint8_t b);
+
+/* Returns true if the given character is a decimal digit. */
+bool pm_char_is_decimal_digit(const uint8_t b);
+
+/* Returns true if the given character is a hexadecimal digit. */
+bool pm_char_is_hexadecimal_digit(const uint8_t b);
+
+#endif
diff --git a/prism/internal/comments.h b/prism/internal/comments.h
new file mode 100644
index 0000000000..bb3039a658
--- /dev/null
+++ b/prism/internal/comments.h
@@ -0,0 +1,20 @@
+#ifndef PRISM_INTERNAL_COMMENTS_H
+#define PRISM_INTERNAL_COMMENTS_H
+
+#include "prism/comments.h"
+
+#include "prism/internal/list.h"
+
+/* A comment found while parsing. */
+struct pm_comment_t {
+    /* The embedded base node. */
+    pm_list_node_t node;
+
+    /* The location of the comment in the source. */
+    pm_location_t location;
+
+    /* The type of the comment. */
+    pm_comment_type_t type;
+};
+
+#endif
diff --git a/prism/internal/constant_pool.h b/prism/internal/constant_pool.h
new file mode 100644
index 0000000000..fa2be783f5
--- /dev/null
+++ b/prism/internal/constant_pool.h
@@ -0,0 +1,117 @@
+#ifndef PRISM_INTERNAL_CONSTANT_POOL_H
+#define PRISM_INTERNAL_CONSTANT_POOL_H
+
+#include "prism/constant_pool.h"
+
+#include "prism/arena.h"
+
+#include <stdbool.h>
+
+/* A constant in the pool which effectively stores a string. */
+struct pm_constant_t {
+    /* A pointer to the start of the string. */
+    const uint8_t *start;
+
+    /* The length of the string. */
+    size_t length;
+};
+
+/*
+ * The type of bucket in the constant pool hash map. This determines how the
+ * bucket should be freed.
+ */
+typedef unsigned int pm_constant_pool_bucket_type_t;
+
+/* By default, each constant is a slice of the source. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0;
+
+/* An owned constant is one for which memory has been allocated. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1;
+
+/* A constant constant is known at compile time. */
+static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2;
+
+/* A bucket in the hash map. */
+typedef struct {
+    /* The incremental ID used for indexing back into the pool. */
+    unsigned int id: 30;
+
+    /* The type of the bucket, which determines how to free it. */
+    pm_constant_pool_bucket_type_t type: 2;
+
+    /* The hash of the bucket. */
+    uint32_t hash;
+
+    /*
+     * A pointer to the start of the string, stored directly in the bucket to
+     * avoid a pointer chase to the constants array during probing.
+     */
+    const uint8_t *start;
+
+    /* The length of the string. */
+    size_t length;
+} pm_constant_pool_bucket_t;
+
+/* The overall constant pool, which stores constants found while parsing. */
+struct pm_constant_pool_t {
+    /* The buckets in the hash map. */
+    pm_constant_pool_bucket_t *buckets;
+
+    /* The constants that are stored in the buckets. */
+    pm_constant_t *constants;
+
+    /* The number of buckets in the hash map. */
+    uint32_t size;
+
+    /* The number of buckets that have been allocated in the hash map. */
+    uint32_t capacity;
+};
+
+/*
+ * When we allocate constants into the pool, we reserve 0 to mean that the slot
+ * is not yet filled. This constant is reused in other places to indicate the
+ * lack of a constant id.
+ */
+#define PM_CONSTANT_ID_UNSET 0
+
+/* Initialize a list of constant ids with a given capacity. */
+void pm_constant_id_list_init_capacity(pm_arena_t *arena, pm_constant_id_list_t *list, size_t capacity);
+
+/* Insert a constant id into a list of constant ids at the specified index. */
+void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id);
+
+/* Checks if the current constant id list includes the given constant id. */
+bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id);
+
+/* Initialize a new constant pool with a given capacity. */
+void pm_constant_pool_init(pm_arena_t *arena, pm_constant_pool_t *pool, uint32_t capacity);
+
+/* Return a pointer to the constant indicated by the given constant id. */
+pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id);
+
+/*
+ * Find a constant in a constant pool. Returns the id of the constant, or 0 if
+ * the constant is not found.
+ */
+pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+/*
+ * Insert a constant into a constant pool that is a slice of a source string.
+ * Returns the id of the constant, or 0 if any potential calls to resize fail.
+ */
+pm_constant_id_t pm_constant_pool_insert_shared(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+/*
+ * Insert a constant into a constant pool from memory that is now owned by the
+ * constant pool. Returns the id of the constant, or 0 if any potential calls to
+ * resize fail.
+ */
+pm_constant_id_t pm_constant_pool_insert_owned(pm_arena_t *arena, pm_constant_pool_t *pool, uint8_t *start, size_t length);
+
+/*
+ * Insert a constant into a constant pool from memory that is constant. Returns
+ * the id of the constant, or 0 if any potential calls to resize fail.
+ */
+pm_constant_id_t pm_constant_pool_insert_constant(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length);
+
+#endif
diff --git a/prism/internal/encoding.h b/prism/internal/encoding.h
new file mode 100644
index 0000000000..62392ef970
--- /dev/null
+++ b/prism/internal/encoding.h
@@ -0,0 +1,242 @@
+#ifndef PRISM_INTERNAL_ENCODING_H
+#define PRISM_INTERNAL_ENCODING_H
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * This struct defines the functions necessary to implement the encoding
+ * interface so we can determine how many bytes the subsequent character takes.
+ * Each callback should return the number of bytes, or 0 if the next bytes are
+ * invalid for the encoding and type.
+ */
+typedef struct {
+    /*
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding. Does not read more than n bytes. It is assumed that n is
+     * at least 1.
+     */
+    size_t (*char_width)(const uint8_t *b, ptrdiff_t n);
+
+    /*
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphabetical. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
+    size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n);
+
+    /*
+     * Return the number of bytes that the next character takes if it is valid
+     * in the encoding and is alphanumeric. Does not read more than n bytes. It
+     * is assumed that n is at least 1.
+     */
+    size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n);
+
+    /*
+     * Return true if the next character is valid in the encoding and is an
+     * uppercase character. Does not read more than n bytes. It is assumed that
+     * n is at least 1.
+     */
+    bool (*isupper_char)(const uint8_t *b, ptrdiff_t n);
+
+    /*
+     * The name of the encoding. This should correspond to a value that can be
+     * passed to Encoding.find in Ruby.
+     */
+    const char *name;
+
+    /* Return true if the encoding is a multibyte encoding. */
+    bool multibyte;
+} pm_encoding_t;
+
+/*
+ * All of the lookup tables use the first bit of each embedded byte to indicate
+ * whether the codepoint is alphabetical.
+ */
+#define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0
+
+/*
+ * All of the lookup tables use the second bit of each embedded byte to indicate
+ * whether the codepoint is alphanumeric.
+ */
+#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1
+
+/*
+ * All of the lookup tables use the third bit of each embedded byte to indicate
+ * whether the codepoint is uppercase.
+ */
+#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2
+
+/* Return the size of the next character in the UTF-8 encoding. */
+size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n);
+
+/*
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphabetical character.
+ */
+size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n);
+
+/*
+ * Return the size of the next character in the UTF-8 encoding if it is an
+ * alphanumeric character.
+ */
+size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n);
+
+/*
+ * Return true if the next character in the UTF-8 encoding if it is an uppercase
+ * character.
+ */
+bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n);
+
+/*
+ * This lookup table is referenced in both the UTF-8 encoding file and the
+ * parser directly in order to speed up the default encoding processing. It is
+ * used to indicate whether a character is alphabetical, alphanumeric, or
+ * uppercase in unicode mappings.
+ */
+extern const uint8_t pm_encoding_unicode_table[256];
+
+/* These are all of the encodings that prism supports. */
+typedef enum {
+    PM_ENCODING_UTF_8 = 0,
+    PM_ENCODING_US_ASCII,
+    PM_ENCODING_ASCII_8BIT,
+    PM_ENCODING_EUC_JP,
+    PM_ENCODING_WINDOWS_31J,
+
+/* We optionally support excluding the full set of encodings to only support the
+ * minimum necessary to process Ruby code without encoding comments. */
+#ifndef PRISM_ENCODING_EXCLUDE_FULL
+    PM_ENCODING_BIG5,
+    PM_ENCODING_BIG5_HKSCS,
+    PM_ENCODING_BIG5_UAO,
+    PM_ENCODING_CESU_8,
+    PM_ENCODING_CP51932,
+    PM_ENCODING_CP850,
+    PM_ENCODING_CP852,
+    PM_ENCODING_CP855,
+    PM_ENCODING_CP949,
+    PM_ENCODING_CP950,
+    PM_ENCODING_CP951,
+    PM_ENCODING_EMACS_MULE,
+    PM_ENCODING_EUC_JP_MS,
+    PM_ENCODING_EUC_JIS_2004,
+    PM_ENCODING_EUC_KR,
+    PM_ENCODING_EUC_TW,
+    PM_ENCODING_GB12345,
+    PM_ENCODING_GB18030,
+    PM_ENCODING_GB1988,
+    PM_ENCODING_GB2312,
+    PM_ENCODING_GBK,
+    PM_ENCODING_IBM437,
+    PM_ENCODING_IBM720,
+    PM_ENCODING_IBM737,
+    PM_ENCODING_IBM775,
+    PM_ENCODING_IBM852,
+    PM_ENCODING_IBM855,
+    PM_ENCODING_IBM857,
+    PM_ENCODING_IBM860,
+    PM_ENCODING_IBM861,
+    PM_ENCODING_IBM862,
+    PM_ENCODING_IBM863,
+    PM_ENCODING_IBM864,
+    PM_ENCODING_IBM865,
+    PM_ENCODING_IBM866,
+    PM_ENCODING_IBM869,
+    PM_ENCODING_ISO_8859_1,
+    PM_ENCODING_ISO_8859_2,
+    PM_ENCODING_ISO_8859_3,
+    PM_ENCODING_ISO_8859_4,
+    PM_ENCODING_ISO_8859_5,
+    PM_ENCODING_ISO_8859_6,
+    PM_ENCODING_ISO_8859_7,
+    PM_ENCODING_ISO_8859_8,
+    PM_ENCODING_ISO_8859_9,
+    PM_ENCODING_ISO_8859_10,
+    PM_ENCODING_ISO_8859_11,
+    PM_ENCODING_ISO_8859_13,
+    PM_ENCODING_ISO_8859_14,
+    PM_ENCODING_ISO_8859_15,
+    PM_ENCODING_ISO_8859_16,
+    PM_ENCODING_KOI8_R,
+    PM_ENCODING_KOI8_U,
+    PM_ENCODING_MAC_CENT_EURO,
+    PM_ENCODING_MAC_CROATIAN,
+    PM_ENCODING_MAC_CYRILLIC,
+    PM_ENCODING_MAC_GREEK,
+    PM_ENCODING_MAC_ICELAND,
+    PM_ENCODING_MAC_JAPANESE,
+    PM_ENCODING_MAC_ROMAN,
+    PM_ENCODING_MAC_ROMANIA,
+    PM_ENCODING_MAC_THAI,
+    PM_ENCODING_MAC_TURKISH,
+    PM_ENCODING_MAC_UKRAINE,
+    PM_ENCODING_SHIFT_JIS,
+    PM_ENCODING_SJIS_DOCOMO,
+    PM_ENCODING_SJIS_KDDI,
+    PM_ENCODING_SJIS_SOFTBANK,
+    PM_ENCODING_STATELESS_ISO_2022_JP,
+    PM_ENCODING_STATELESS_ISO_2022_JP_KDDI,
+    PM_ENCODING_TIS_620,
+    PM_ENCODING_UTF8_MAC,
+    PM_ENCODING_UTF8_DOCOMO,
+    PM_ENCODING_UTF8_KDDI,
+    PM_ENCODING_UTF8_SOFTBANK,
+    PM_ENCODING_WINDOWS_1250,
+    PM_ENCODING_WINDOWS_1251,
+    PM_ENCODING_WINDOWS_1252,
+    PM_ENCODING_WINDOWS_1253,
+    PM_ENCODING_WINDOWS_1254,
+    PM_ENCODING_WINDOWS_1255,
+    PM_ENCODING_WINDOWS_1256,
+    PM_ENCODING_WINDOWS_1257,
+    PM_ENCODING_WINDOWS_1258,
+    PM_ENCODING_WINDOWS_874,
+#endif
+
+    PM_ENCODING_MAXIMUM
+} pm_encoding_type_t;
+
+/* This is the table of all of the encodings that prism supports. */
+extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM];
+
+/*
+ * This is the default UTF-8 encoding. We need a reference to it to quickly
+ * create parsers.
+ */
+#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8])
+
+/*
+ * This is the US-ASCII encoding. We need a reference to it to be able to
+ * compare against it when a string is being created because it could possibly
+ * need to fall back to ASCII-8BIT.
+ */
+#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII])
+
+/*
+ * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk
+ * can compare against it because invalid multibyte characters are not a thing
+ * in this encoding. It is also needed for handling Regexp encoding flags.
+ */
+#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT])
+
+/*
+ * This is the EUC-JP encoding. We need a reference to it to quickly process
+ * regular expression modifiers.
+ */
+#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP])
+
+/*
+ * This is the Windows-31J encoding. We need a reference to it to quickly
+ * process regular expression modifiers.
+ */
+#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J])
+
+/*
+ * Parse the given name of an encoding and return a pointer to the corresponding
+ * encoding struct if one can be found, otherwise return NULL.
+ */
+const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end);
+
+#endif
diff --git a/prism/internal/integer.h b/prism/internal/integer.h
new file mode 100644
index 0000000000..7c9767e323
--- /dev/null
+++ b/prism/internal/integer.h
@@ -0,0 +1,68 @@
+/*
+ * This module provides functions for working with arbitrary-sized integers.
+ */
+#ifndef PRISM_INTERNAL_INTEGER_H
+#define PRISM_INTERNAL_INTEGER_H
+
+#include "prism/buffer.h"
+#include "prism/integer.h"
+
+#include <stdint.h>
+
+/*
+ * An enum controlling the base of an integer. It is expected that the base is
+ * already known before parsing the integer, even though it could be derived
+ * from the string itself.
+ */
+typedef enum {
+    /* The default decimal base, with no prefix. Leading 0s will be ignored. */
+    PM_INTEGER_BASE_DEFAULT,
+
+    /* The binary base, indicated by a 0b or 0B prefix. */
+    PM_INTEGER_BASE_BINARY,
+
+    /* The octal base, indicated by a 0, 0o, or 0O prefix. */
+    PM_INTEGER_BASE_OCTAL,
+
+    /* The decimal base, indicated by a 0d, 0D, or empty prefix. */
+    PM_INTEGER_BASE_DECIMAL,
+
+    /* The hexadecimal base, indicated by a 0x or 0X prefix. */
+    PM_INTEGER_BASE_HEXADECIMAL,
+
+    /*
+     * An unknown base, in which case pm_integer_parse will derive it based on
+     * the content of the string. This is less efficient and does more
+     * comparisons, so if callers know the base ahead of time, they should use
+     * that instead.
+     */
+    PM_INTEGER_BASE_UNKNOWN
+} pm_integer_base_t;
+
+/*
+ * Parse an integer from a string. This assumes that the format of the integer
+ * has already been validated, as internal validation checks are not performed
+ * here.
+ */
+void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end);
+
+/*
+ * Compare two integers. This function returns -1 if the left integer is less
+ * than the right integer, 0 if they are equal, and 1 if the left integer is
+ * greater than the right integer.
+ */
+int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right);
+
+/*
+ * Reduce a ratio of integers to its simplest form.
+ *
+ * If either the numerator or denominator do not fit into a 32-bit integer, then
+ * this function is a no-op. In the future, we may consider reducing even the
+ * larger numbers, but for now we're going to keep it simple.
+ */
+void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator);
+
+/* Convert an integer to a decimal string. */
+void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer);
+
+#endif
diff --git a/prism/internal/isinf.h b/prism/internal/isinf.h
new file mode 100644
index 0000000000..41c160f56d
--- /dev/null
+++ b/prism/internal/isinf.h
@@ -0,0 +1,16 @@
+#ifndef PRISM_INTERNAL_ISINF_H
+#define PRISM_INTERNAL_ISINF_H
+
+/*
+ * isinf on POSIX systems accepts a float, a double, or a long double. But mingw
+ * didn't provide an isinf macro, only an isinf function that only accepts
+ * floats, so we need to use _finite instead.
+ */
+#ifdef __MINGW64__
+    #include <float.h>
+    #define PRISM_ISINF(x) (!_finite(x))
+#else
+    #define PRISM_ISINF(x) isinf(x)
+#endif
+
+#endif
diff --git a/prism/internal/line_offset_list.h b/prism/internal/line_offset_list.h
new file mode 100644
index 0000000000..dac9f7052e
--- /dev/null
+++ b/prism/internal/line_offset_list.h
@@ -0,0 +1,34 @@
+#ifndef PRISM_INTERNAL_LINE_OFFSET_LIST_H
+#define PRISM_INTERNAL_LINE_OFFSET_LIST_H
+
+#include "prism/compiler/force_inline.h"
+
+#include "prism/arena.h"
+#include "prism/line_offset_list.h"
+
+/* Initialize a new line offset list with the given capacity. */
+void pm_line_offset_list_init(pm_arena_t *arena, pm_line_offset_list_t *list, size_t capacity);
+
+/* Clear out the offsets that have been appended to the list. */
+void pm_line_offset_list_clear(pm_line_offset_list_t *list);
+
+/* Append a new offset to the list (slow path with resize). */
+void pm_line_offset_list_append_slow(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor);
+
+/* Append a new offset to the list. */
+static PRISM_FORCE_INLINE void
+pm_line_offset_list_append(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor) {
+    if (list->size < list->capacity) {
+        list->offsets[list->size++] = cursor;
+    } else {
+        pm_line_offset_list_append_slow(arena, list, cursor);
+    }
+}
+
+/*
+ * Returns the line of the given offset. If the offset is not in the list, the
+ * line of the closest offset less than the given offset is returned.
+ */
+int32_t pm_line_offset_list_line(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line);
+
+#endif
diff --git a/prism/internal/list.h b/prism/internal/list.h
new file mode 100644
index 0000000000..0ab59ef32a
--- /dev/null
+++ b/prism/internal/list.h
@@ -0,0 +1,62 @@
+#ifndef PRISM_INTERNAL_LIST_H
+#define PRISM_INTERNAL_LIST_H
+
+#include <stddef.h>
+
+/*
+ * This struct represents an abstract linked list that provides common
+ * functionality. It is meant to be used any time a linked list is necessary to
+ * store data.
+ *
+ * The linked list itself operates off a set of pointers. Because the pointers
+ * are not necessarily sequential, they can be of any size. We use this fact to
+ * allow the consumer of this linked list to extend the node struct to include
+ * any data they want. This is done by using the pm_list_node_t as the first
+ * member of the struct.
+ *
+ * For example, if we want to store a list of integers, we can do the following:
+ *
+ * ```c
+ * typedef struct {
+ *     pm_list_node_t node;
+ *     int value;
+ * } pm_int_node_t;
+ *
+ * pm_list_t list = { 0 };
+ * pm_int_node_t *node = xmalloc(sizeof(pm_int_node_t));
+ * node->value = 5;
+ *
+ * pm_list_append(&list, &node->node);
+ * ```
+ *
+ * The pm_list_t struct is used to represent the overall linked list. It
+ * contains a pointer to the head and tail of the list. This allows for easy
+ * iteration and appending of new nodes.
+ */
+typedef struct pm_list_node {
+    /* A pointer to the next node in the list. */
+    struct pm_list_node *next;
+} pm_list_node_t;
+
+/*
+ * This represents the overall linked list. It keeps a pointer to the head and
+ * tail so that iteration is easy and pushing new nodes is easy.
+ */
+typedef struct {
+    /* The size of the list. */
+    size_t size;
+
+    /* A pointer to the head of the list. */
+    pm_list_node_t *head;
+
+    /* A pointer to the tail of the list. */
+    pm_list_node_t *tail;
+} pm_list_t;
+
+/* Returns the size of the list. */
+size_t pm_list_size(pm_list_t *list);
+
+/* Append a node to the given list. */
+void pm_list_append(pm_list_t *list, pm_list_node_t *node);
+
+#endif
diff --git a/prism/internal/magic_comments.h b/prism/internal/magic_comments.h
new file mode 100644
index 0000000000..72a581c5d7
--- /dev/null
+++ b/prism/internal/magic_comments.h
@@ -0,0 +1,23 @@
+#ifndef PRISM_INTERNAL_MAGIC_COMMENTS_H
+#define PRISM_INTERNAL_MAGIC_COMMENTS_H
+
+#include "prism/magic_comments.h"
+
+#include "prism/internal/list.h"
+
+/*
+ * This is a node in the linked list of magic comments that we've found while
+ * parsing.
+ */
+struct pm_magic_comment_t {
+    /* The embedded base node. */
+    pm_list_node_t node;
+
+    /* The key of the magic comment. */
+    pm_location_t key;
+
+    /* The value of the magic comment. */
+    pm_location_t value;
+};
+
+#endif
diff --git a/prism/internal/memchr.h b/prism/internal/memchr.h
new file mode 100644
index 0000000000..6f6b0bca30
--- /dev/null
+++ b/prism/internal/memchr.h
@@ -0,0 +1,15 @@
+#ifndef PRISM_INTERNAL_MEMCHR_H
+#define PRISM_INTERNAL_MEMCHR_H
+
+#include "prism/internal/encoding.h"
+
+#include <stddef.h>
+
+/*
+ * We need to roll our own memchr to handle cases where the encoding changes and
+ * we need to search for a character in a buffer that could be the trailing byte
+ * of a multibyte character.
+ */
+const void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding);
+
+#endif
diff --git a/prism/internal/node.h b/prism/internal/node.h
new file mode 100644
index 0000000000..ca6d5616d7
--- /dev/null
+++ b/prism/internal/node.h
@@ -0,0 +1,32 @@
+#ifndef PRISM_INTERNAL_NODE_H
+#define PRISM_INTERNAL_NODE_H
+
+#include "prism/node.h"
+
+#include "prism/compiler/force_inline.h"
+
+#include "prism/arena.h"
+
+/*
+ * Slow path for pm_node_list_append: grow the list and append the node.
+ * Do not call directly — use pm_node_list_append instead.
+ */
+void pm_node_list_append_slow(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node);
+
+/* Append a new node onto the end of the node list. */
+static PRISM_FORCE_INLINE void
+pm_node_list_append(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) {
+    if (list->size < list->capacity) {
+        list->nodes[list->size++] = node;
+    } else {
+        pm_node_list_append_slow(arena, list, node);
+    }
+}
+
+/* Prepend a new node onto the beginning of the node list. */
+void pm_node_list_prepend(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node);
+
+/* Concatenate the given node list onto the end of the other node list. */
+void pm_node_list_concat(pm_arena_t *arena, pm_node_list_t *list, pm_node_list_t *other);
+
+#endif
diff --git a/prism/internal/options.h b/prism/internal/options.h
new file mode 100644
index 0000000000..7e37742a8b
--- /dev/null
+++ b/prism/internal/options.h
@@ -0,0 +1,212 @@
+#ifndef PRISM_INTERNAL_OPTIONS_H
+#define PRISM_INTERNAL_OPTIONS_H
+
+#include "prism/options.h"
+
+/* A scope of locals surrounding the code that is being parsed. */
+struct pm_options_scope_t {
+    /* The number of locals in the scope. */
+    size_t locals_count;
+
+    /* The names of the locals in the scope. */
+    pm_string_t *locals;
+
+    /* Flags for the set of forwarding parameters in this scope. */
+    uint8_t forwarding;
+};
+
+/*
+ * The version of Ruby syntax that we should be parsing with. This is used to
+ * allow consumers to specify which behavior they want in case they need to
+ * parse in the same way as a specific version of CRuby would have.
+ */
+typedef enum {
+    /*
+     * If an explicit version is not provided, the current version of prism will
+     * be used.
+     */
+    PM_OPTIONS_VERSION_UNSET = 0,
+
+    /* The vendored version of prism in CRuby 3.3.x. */
+    PM_OPTIONS_VERSION_CRUBY_3_3 = 1,
+
+    /* The vendored version of prism in CRuby 3.4.x. */
+    PM_OPTIONS_VERSION_CRUBY_3_4 = 2,
+
+    /* The vendored version of prism in CRuby 4.0.x. */
+    PM_OPTIONS_VERSION_CRUBY_3_5 = 3,
+
+    /* The vendored version of prism in CRuby 4.0.x. */
+    PM_OPTIONS_VERSION_CRUBY_4_0 = 3,
+
+    /* The vendored version of prism in CRuby 4.1.x. */
+    PM_OPTIONS_VERSION_CRUBY_4_1 = 4,
+
+    /* The current version of prism. */
+    PM_OPTIONS_VERSION_LATEST = PM_OPTIONS_VERSION_CRUBY_4_1
+} pm_options_version_t;
+
+/* The options that can be passed to the parser. */
+struct pm_options_t {
+    /*
+     * The callback to call when additional switches are found in a shebang
+     * comment.
+     */
+    pm_options_shebang_callback_t shebang_callback;
+
+    /*
+     * Any additional data that should be passed along to the shebang callback
+     * if one was set.
+     */
+    void *shebang_callback_data;
+
+    /* The name of the file that is currently being parsed. */
+    pm_string_t filepath;
+
+    /*
+     * The line within the file that the parse starts on. This value is
+     * 1-indexed.
+     */
+    int32_t line;
+
+    /*
+     * The name of the encoding that the source file is in. Note that this must
+     * correspond to a name that can be found with Encoding.find in Ruby.
+     */
+    pm_string_t encoding;
+
+    /* The number of scopes surrounding the code that is being parsed. */
+    size_t scopes_count;
+
+    /*
+     * The scopes surrounding the code that is being parsed. For most parses
+     * this will be NULL, but for evals it will be the locals that are in scope
+     * surrounding the eval. Scopes are ordered from the outermost scope to the
+     * innermost one.
+     */
+    pm_options_scope_t *scopes;
+
+    /*
+     * The version of prism that we should be parsing with. This is used to
+     * allow consumers to specify which behavior they want in case they need to
+     * parse exactly as a specific version of CRuby.
+     */
+    pm_options_version_t version;
+
+    /* A bitset of the various options that were set on the command line. */
+    uint8_t command_line;
+
+    /*
+    * Whether or not the frozen string literal option has been set.
+    * May be:
+    *  - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED
+    *  - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED
+    *  - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
+    */
+    int8_t frozen_string_literal;
+
+    /*
+     * Whether or not the encoding magic comments should be respected. This is a
+     * niche use-case where you want to parse a file with a specific encoding
+     * but ignore any encoding magic comments at the top of the file.
+     */
+    bool encoding_locked;
+
+    /*
+     * When the file being parsed is the main script, the shebang will be
+     * considered for command-line flags (or for implicit -x). The caller needs
+     * to pass this information to the parser so that it can behave correctly.
+     */
+    bool main_script;
+
+    /*
+     * When the file being parsed is considered a "partial" script, jumps will
+     * not be marked as errors if they are not contained within loops/blocks.
+     * This is used in the case that you're parsing a script that you know will
+     * be embedded inside another script later, but you do not have that context
+     * yet. For example, when parsing an ERB template that will be evaluated
+     * inside another script.
+     */
+    bool partial_script;
+
+    /*
+     * Whether or not the parser should freeze the nodes that it creates. This
+     * makes it possible to have a deeply frozen AST that is safe to share
+     * between concurrency primitives.
+     */
+    bool freeze;
+};
+
+/* Free the internal memory associated with the options. */
+void pm_options_cleanup(pm_options_t *options);
+
+/*
+ * Deserialize an options struct from the given binary string. This is used to
+ * pass options to the parser from an FFI call so that consumers of the library
+ * from an FFI perspective don't have to worry about the structure of our
+ * options structs. Since the source of these calls will be from Ruby
+ * implementation internals we assume it is from a trusted source.
+ *
+ * `data` is assumed to be a valid pointer pointing to well-formed data. The
+ * layout of this data should be the same every time, and is described below:
+ *
+ * | # bytes | field                      |
+ * | ------- | -------------------------- |
+ * | `4`     | the length of the filepath |
+ * | ...     | the filepath bytes         |
+ * | `4`     | the line number            |
+ * | `4`     | the length the encoding    |
+ * | ...     | the encoding bytes         |
+ * | `1`     | frozen string literal      |
+ * | `1`     | -p command line option     |
+ * | `1`     | -n command line option     |
+ * | `1`     | -l command line option     |
+ * | `1`     | -a command line option     |
+ * | `1`     | the version                |
+ * | `1`     | encoding locked            |
+ * | `1`     | main script                |
+ * | `1`     | partial script             |
+ * | `1`     | freeze                     |
+ * | `4`     | the number of scopes       |
+ * | ...     | the scopes                 |
+ *
+ * The version field is an enum, so it should be one of the following values:
+ *
+ * | value | version                   |
+ * | ----- | ------------------------- |
+ * | `0`   | use the latest version of prism |
+ * | `1`   | use the version of prism that is vendored in CRuby 3.3.0 |
+ * | `2`   | use the version of prism that is vendored in CRuby 3.4.0 |
+ * | `3`   | use the version of prism that is vendored in CRuby 4.0.0 |
+ * | `4`   | use the version of prism that is vendored in CRuby 4.1.0 |
+ *
+ * Each scope is laid out as follows:
+ *
+ * | # bytes | field                      |
+ * | ------- | -------------------------- |
+ * | `4`     | the number of locals       |
+ * | `1`     | the forwarding flags       |
+ * | ...     | the locals                 |
+ *
+ * Each local is laid out as follows:
+ *
+ * | # bytes | field                      |
+ * | ------- | -------------------------- |
+ * | `4`     | the length of the local    |
+ * | ...     | the local bytes            |
+ *
+ * Some additional things to note about this layout:
+ *
+ * * The filepath can have a length of 0, in which case we'll consider it an
+ *   empty string.
+ * * The line number should be 0-indexed.
+ * * The encoding can have a length of 0, in which case we'll use the default
+ *   encoding (UTF-8). If it's not 0, it should correspond to a name of an
+ *   encoding that can be passed to `Encoding.find` in Ruby.
+ * * The frozen string literal, encoding locked, main script, and partial script
+ *   fields are booleans, so their values should be either 0 or 1.
+ * * The number of scopes can be 0.
+ */
+void pm_options_read(pm_options_t *options, const char *data);
+
+#endif
diff --git a/prism/internal/parser.h b/prism/internal/parser.h
new file mode 100644
index 0000000000..4320cf4029
--- /dev/null
+++ b/prism/internal/parser.h
@@ -0,0 +1,958 @@
+#ifndef PRISM_INTERNAL_PARSER_H
+#define PRISM_INTERNAL_PARSER_H
+
+#include "prism/compiler/accel.h"
+
+#include "prism/internal/arena.h"
+#include "prism/internal/constant_pool.h"
+#include "prism/internal/encoding.h"
+#include "prism/internal/list.h"
+#include "prism/internal/options.h"
+#include "prism/internal/static_literals.h"
+#include "prism/internal/strpbrk.h"
+
+#include "prism/ast.h"
+#include "prism/line_offset_list.h"
+#include "prism/parser.h"
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * This enum provides various bits that represent different kinds of states that
+ * the lexer can track. This is used to determine which kind of token to return
+ * based on the context of the parser.
+ */
+typedef enum {
+    PM_LEX_STATE_BIT_BEG,
+    PM_LEX_STATE_BIT_END,
+    PM_LEX_STATE_BIT_ENDARG,
+    PM_LEX_STATE_BIT_ENDFN,
+    PM_LEX_STATE_BIT_ARG,
+    PM_LEX_STATE_BIT_CMDARG,
+    PM_LEX_STATE_BIT_MID,
+    PM_LEX_STATE_BIT_FNAME,
+    PM_LEX_STATE_BIT_DOT,
+    PM_LEX_STATE_BIT_CLASS,
+    PM_LEX_STATE_BIT_LABEL,
+    PM_LEX_STATE_BIT_LABELED,
+    PM_LEX_STATE_BIT_FITEM
+} pm_lex_state_bit_t;
+
+/*
+ * This enum combines the various bits from the above enum into individual
+ * values that represent the various states of the lexer.
+ */
+typedef enum {
+    PM_LEX_STATE_NONE = 0,
+    PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
+    PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
+    PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
+    PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
+    PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
+    PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
+    PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
+    PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
+    PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
+    PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
+    PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
+    PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
+    PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
+    PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
+    PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
+    PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
+} pm_lex_state_t;
+
+/*
+ * The type of quote that a heredoc uses.
+ */
+typedef enum {
+    PM_HEREDOC_QUOTE_NONE,
+    PM_HEREDOC_QUOTE_SINGLE = '\'',
+    PM_HEREDOC_QUOTE_DOUBLE = '"',
+    PM_HEREDOC_QUOTE_BACKTICK = '`',
+} pm_heredoc_quote_t;
+
+/*
+ * The type of indentation that a heredoc uses.
+ */
+typedef enum {
+    PM_HEREDOC_INDENT_NONE,
+    PM_HEREDOC_INDENT_DASH,
+    PM_HEREDOC_INDENT_TILDE,
+} pm_heredoc_indent_t;
+
+/*
+ * All of the information necessary to store to lexing a heredoc.
+ */
+typedef struct {
+    /* A pointer to the start of the heredoc identifier. */
+    const uint8_t *ident_start;
+
+    /* The length of the heredoc identifier. */
+    size_t ident_length;
+
+    /* The type of quote that the heredoc uses. */
+    pm_heredoc_quote_t quote;
+
+    /* The type of indentation that the heredoc uses. */
+    pm_heredoc_indent_t indent;
+} pm_heredoc_lex_mode_t;
+
+/*
+ * When lexing Ruby source, the lexer has a small amount of state to tell which
+ * kind of token it is currently lexing. For example, when we find the start of
+ * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
+ * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
+ * are found as part of a string.
+ */
+typedef struct pm_lex_mode {
+    /* The type of this lex mode. */
+    enum {
+        /* This state is used when any given token is being lexed. */
+        PM_LEX_DEFAULT,
+
+        /*
+         * This state is used when we're lexing as normal but inside an embedded
+         * expression of a string.
+         */
+        PM_LEX_EMBEXPR,
+
+        /*
+         * This state is used when we're lexing a variable that is embedded
+         * directly inside of a string with the # shorthand.
+         */
+        PM_LEX_EMBVAR,
+
+        /* This state is used when you are inside the content of a heredoc. */
+        PM_LEX_HEREDOC,
+
+        /*
+         * This state is used when we are lexing a list of tokens, as in a %w
+         * word list literal or a %i symbol list literal.
+         */
+        PM_LEX_LIST,
+
+        /*
+         * This state is used when a regular expression has been begun and we
+         * are looking for the terminator.
+         */
+        PM_LEX_REGEXP,
+
+        /*
+         * This state is used when we are lexing a string or a string-like
+         * token, as in string content with either quote or an xstring.
+         */
+        PM_LEX_STRING
+    } mode;
+
+    /* The data associated with this type of lex mode. */
+    union {
+        struct {
+            /* This keeps track of the nesting level of the list. */
+            size_t nesting;
+
+            /* Whether or not interpolation is allowed in this list. */
+            bool interpolation;
+
+            /*
+             * When lexing a list, it takes into account balancing the
+             * terminator if the terminator is one of (), [], {}, or <>.
+             */
+            uint8_t incrementor;
+
+            /* This is the terminator of the list literal. */
+            uint8_t terminator;
+
+            /*
+             * This is the character set that should be used to delimit the
+             * tokens within the list.
+             */
+            uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
+        } list;
+
+        struct {
+            /*
+             * This keeps track of the nesting level of the regular expression.
+             */
+            size_t nesting;
+
+            /*
+             * When lexing a regular expression, it takes into account balancing
+             * the terminator if the terminator is one of (), [], {}, or <>.
+             */
+            uint8_t incrementor;
+
+            /* This is the terminator of the regular expression. */
+            uint8_t terminator;
+
+            /*
+             * This is the character set that should be used to delimit the
+             * tokens within the regular expression.
+             */
+            uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
+        } regexp;
+
+        struct {
+            /* This keeps track of the nesting level of the string. */
+            size_t nesting;
+
+            /* Whether or not interpolation is allowed in this string. */
+            bool interpolation;
+
+            /*
+             * Whether or not at the end of the string we should allow a :,
+             * which would indicate this was a dynamic symbol instead of a
+             * string.
+             */
+            bool label_allowed;
+
+            /*
+             * When lexing a string, it takes into account balancing the
+             * terminator if the terminator is one of (), [], {}, or <>.
+             */
+            uint8_t incrementor;
+
+            /*
+             * This is the terminator of the string. It is typically either a
+             * single or double quote.
+             */
+            uint8_t terminator;
+
+            /*
+             * This is the character set that should be used to delimit the
+             * tokens within the string.
+             */
+            uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE];
+        } string;
+
+        struct {
+            /*
+             * All of the data necessary to lex a heredoc.
+             */
+            pm_heredoc_lex_mode_t base;
+
+            /*
+             * This is the pointer to the character where lexing should resume
+             * once the heredoc has been completely processed.
+             */
+            const uint8_t *next_start;
+
+            /*
+             * This is used to track the amount of common whitespace on each
+             * line so that we know how much to dedent each line in the case of
+             * a tilde heredoc.
+             */
+            size_t *common_whitespace;
+
+            /* True if the previous token ended with a line continuation. */
+            bool line_continuation;
+        } heredoc;
+    } as;
+
+    /* The previous lex state so that it knows how to pop. */
+    struct pm_lex_mode *prev;
+} pm_lex_mode_t;
+
+/*
+ * We pre-allocate a certain number of lex states in order to avoid having to
+ * call malloc too many times while parsing. You really shouldn't need more than
+ * this because you only really nest deeply when doing string interpolation.
+ */
+#define PM_LEX_STACK_SIZE 4
+
+/*
+ * While parsing, we keep track of a stack of contexts. This is helpful for
+ * error recovery so that we can pop back to a previous context when we hit a
+ * token that is understood by a parent context but not by the current context.
+ */
+typedef enum {
+    /* a null context, used for returning a value from a function */
+    PM_CONTEXT_NONE = 0,
+
+    /* a begin statement */
+    PM_CONTEXT_BEGIN,
+
+    /* an ensure statement with an explicit begin */
+    PM_CONTEXT_BEGIN_ENSURE,
+
+    /* a rescue else statement with an explicit begin */
+    PM_CONTEXT_BEGIN_ELSE,
+
+    /* a rescue statement with an explicit begin */
+    PM_CONTEXT_BEGIN_RESCUE,
+
+    /* expressions in block arguments using braces */
+    PM_CONTEXT_BLOCK_BRACES,
+
+    /* expressions in block arguments using do..end */
+    PM_CONTEXT_BLOCK_KEYWORDS,
+
+    /* an ensure statement within a do..end block */
+    PM_CONTEXT_BLOCK_ENSURE,
+
+    /* a rescue else statement within a do..end block */
+    PM_CONTEXT_BLOCK_ELSE,
+
+    /* expressions in block parameters `foo do |...| end ` */
+    PM_CONTEXT_BLOCK_PARAMETERS,
+
+    /* a rescue statement within a do..end block */
+    PM_CONTEXT_BLOCK_RESCUE,
+
+    /* a case when statements */
+    PM_CONTEXT_CASE_WHEN,
+
+    /* a case in statements */
+    PM_CONTEXT_CASE_IN,
+
+    /* a class declaration */
+    PM_CONTEXT_CLASS,
+
+    /* an ensure statement within a class statement */
+    PM_CONTEXT_CLASS_ENSURE,
+
+    /* a rescue else statement within a class statement */
+    PM_CONTEXT_CLASS_ELSE,
+
+    /* a rescue statement within a class statement */
+    PM_CONTEXT_CLASS_RESCUE,
+
+    /* a method definition */
+    PM_CONTEXT_DEF,
+
+    /* an ensure statement within a method definition */
+    PM_CONTEXT_DEF_ENSURE,
+
+    /* a rescue else statement within a method definition */
+    PM_CONTEXT_DEF_ELSE,
+
+    /* a rescue statement within a method definition */
+    PM_CONTEXT_DEF_RESCUE,
+
+    /* a method definition's parameters */
+    PM_CONTEXT_DEF_PARAMS,
+
+    /* a defined? expression */
+    PM_CONTEXT_DEFINED,
+
+    /* a method definition's default parameter */
+    PM_CONTEXT_DEFAULT_PARAMS,
+
+    /* an else clause */
+    PM_CONTEXT_ELSE,
+
+    /* an elsif clause */
+    PM_CONTEXT_ELSIF,
+
+    /* an interpolated expression */
+    PM_CONTEXT_EMBEXPR,
+
+    /* a for loop */
+    PM_CONTEXT_FOR,
+
+    /* a for loop's index */
+    PM_CONTEXT_FOR_INDEX,
+
+    /* an if statement */
+    PM_CONTEXT_IF,
+
+    /* a lambda expression with braces */
+    PM_CONTEXT_LAMBDA_BRACES,
+
+    /* a lambda expression with do..end */
+    PM_CONTEXT_LAMBDA_DO_END,
+
+    /* an ensure statement within a lambda expression */
+    PM_CONTEXT_LAMBDA_ENSURE,
+
+    /* a rescue else statement within a lambda expression */
+    PM_CONTEXT_LAMBDA_ELSE,
+
+    /* a rescue statement within a lambda expression */
+    PM_CONTEXT_LAMBDA_RESCUE,
+
+    /* the predicate clause of a loop statement */
+    PM_CONTEXT_LOOP_PREDICATE,
+
+    /* the top level context */
+    PM_CONTEXT_MAIN,
+
+    /* a module declaration */
+    PM_CONTEXT_MODULE,
+
+    /* an ensure statement within a module statement */
+    PM_CONTEXT_MODULE_ENSURE,
+
+    /* a rescue else statement within a module statement */
+    PM_CONTEXT_MODULE_ELSE,
+
+    /* a rescue statement within a module statement */
+    PM_CONTEXT_MODULE_RESCUE,
+
+    /* a multiple target expression */
+    PM_CONTEXT_MULTI_TARGET,
+
+    /* a parenthesized expression */
+    PM_CONTEXT_PARENS,
+
+    /* an END block */
+    PM_CONTEXT_POSTEXE,
+
+    /* a predicate inside an if/elsif/unless statement */
+    PM_CONTEXT_PREDICATE,
+
+    /* a BEGIN block */
+    PM_CONTEXT_PREEXE,
+
+    /* a modifier rescue clause */
+    PM_CONTEXT_RESCUE_MODIFIER,
+
+    /* a singleton class definition */
+    PM_CONTEXT_SCLASS,
+
+    /* an ensure statement with a singleton class */
+    PM_CONTEXT_SCLASS_ENSURE,
+
+    /* a rescue else statement with a singleton class */
+    PM_CONTEXT_SCLASS_ELSE,
+
+    /* a rescue statement with a singleton class */
+    PM_CONTEXT_SCLASS_RESCUE,
+
+    /* a ternary expression */
+    PM_CONTEXT_TERNARY,
+
+    /* an unless statement */
+    PM_CONTEXT_UNLESS,
+
+    /* an until statement */
+    PM_CONTEXT_UNTIL,
+
+    /* a while statement */
+    PM_CONTEXT_WHILE,
+} pm_context_t;
+
+/* This is a node in a linked list of contexts. */
+typedef struct pm_context_node {
+    /* The context that this node represents. */
+    pm_context_t context;
+
+    /* A pointer to the previous context in the linked list. */
+    struct pm_context_node *prev;
+} pm_context_node_t;
+
+/* The type of shareable constant value that can be set. */
+typedef uint8_t pm_shareable_constant_value_t;
+static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0;
+static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL;
+static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING;
+static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY;
+
+/*
+ * This tracks an individual local variable in a certain lexical context, as
+ * well as the number of times is it read.
+ */
+typedef struct {
+    /* The name of the local variable. */
+    pm_constant_id_t name;
+
+    /* The location of the local variable in the source. */
+    pm_location_t location;
+
+    /* The index of the local variable in the local table. */
+    uint32_t index;
+
+    /* The number of times the local variable is read. */
+    uint32_t reads;
+
+    /* The hash of the local variable. */
+    uint32_t hash;
+} pm_local_t;
+
+/*
+ * This is a set of local variables in a certain lexical context (method, class,
+ * module, etc.). We need to track how many times these variables are read in
+ * order to warn if they only get written.
+ */
+typedef struct pm_locals {
+    /* The number of local variables in the set. */
+    uint32_t size;
+
+    /* The capacity of the local variables set. */
+    uint32_t capacity;
+
+    /*
+     * A bloom filter over constant IDs stored in this set. Used to quickly
+     * reject lookups for names that are definitely not present, avoiding the
+     * cost of a linear scan or hash probe.
+     */
+    uint32_t bloom;
+
+    /* The nullable allocated memory for the local variables in the set. */
+    pm_local_t *locals;
+} pm_locals_t;
+
+/* The flags about scope parameters that can be set. */
+typedef uint8_t pm_scope_parameters_t;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20;
+static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40;
+
+/*
+ * This struct represents a node in a linked list of scopes. Some scopes can see
+ * into their parent scopes, while others cannot.
+ */
+typedef struct pm_scope {
+    /* A pointer to the previous scope in the linked list. */
+    struct pm_scope *previous;
+
+    /* The IDs of the locals in the given scope. */
+    pm_locals_t locals;
+
+    /*
+     * This is a list of the implicit parameters contained within the block.
+     * These will be processed after the block is parsed to determine the kind
+     * of parameters node that should be used and to check if any errors need to
+     * be added.
+     */
+    pm_node_list_t implicit_parameters;
+
+    /*
+     * This is a bitfield that indicates the parameters that are being used in
+     * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants.
+     * There are three different kinds of parameters that can be used in a
+     * scope:
+     *
+     * - Ordinary parameters (e.g., def foo(bar); end)
+     * - Numbered parameters (e.g., def foo; _1; end)
+     * - The it parameter (e.g., def foo; it; end)
+     *
+     * If ordinary parameters are being used, then certain parameters can be
+     * forwarded to another method/structure. Those are indicated by four
+     * additional bits in the params field. For example, some combinations of:
+     *
+     * - def foo(*); end
+     * - def foo(**); end
+     * - def foo(&); end
+     * - def foo(...); end
+     */
+    pm_scope_parameters_t parameters;
+
+    /*
+     * The current state of constant shareability for this scope. This is
+     * changed by magic shareable_constant_value comments.
+     */
+    pm_shareable_constant_value_t shareable_constant;
+
+    /*
+     * A boolean indicating whether or not this scope can see into its parent.
+     * If closed is true, then the scope cannot see into its parent.
+     */
+    bool closed;
+} pm_scope_t;
+
+/*
+ * A struct that represents a stack of boolean values.
+ */
+typedef uint32_t pm_state_stack_t;
+
+/*
+ * This struct represents the overall parser. It contains a reference to the
+ * source file, as well as pointers that indicate where in the source it's
+ * currently parsing. It also contains the most recent and current token that
+ * it's considering.
+ */
+struct pm_parser_t {
+    /* The arena used for all AST-lifetime allocations. Caller-owned. */
+    pm_arena_t *arena;
+
+    /* The arena used for parser metadata (comments, diagnostics, etc.). */
+    pm_arena_t metadata_arena;
+
+    /*
+     * The next node identifier that will be assigned. This is a unique
+     * identifier used to track nodes such that the syntax tree can be dropped
+     * but the node can be found through another parse.
+     */
+    uint32_t node_id;
+
+    /*
+     * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant
+     * constant pool lookups when the same token is resolved multiple times
+     * (e.g., once during lexing for local variable detection, and again
+     * during parsing for node creation).
+     */
+    struct {
+        const uint8_t *start;
+        const uint8_t *end;
+        pm_constant_id_t id;
+    } constant_cache;
+
+    /* The current state of the lexer. */
+    pm_lex_state_t lex_state;
+
+    /* Tracks the current nesting of (), [], and {}. */
+    int enclosure_nesting;
+
+    /*
+     * Used to temporarily track the nesting of enclosures to determine if a {
+     * is the beginning of a lambda following the parameters of a lambda.
+     */
+    int lambda_enclosure_nesting;
+
+    /*
+     * Used to track the nesting of braces to ensure we get the correct value
+     * when we are interpolating blocks with braces.
+     */
+    int brace_nesting;
+
+    /*
+     * The stack used to determine if a do keyword belongs to the predicate of a
+     * while, until, or for loop.
+     */
+    pm_state_stack_t do_loop_stack;
+
+    /*
+     * The stack used to determine if a do keyword belongs to the beginning of a
+     * block.
+     */
+    pm_state_stack_t accepts_block_stack;
+
+    /* A stack of lex modes. */
+    struct {
+        /* The current mode of the lexer. */
+        pm_lex_mode_t *current;
+
+        /* The stack of lexer modes. */
+        pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
+
+        /* The current index into the lexer mode stack. */
+        size_t index;
+    } lex_modes;
+
+    /* The pointer to the start of the source. */
+    const uint8_t *start;
+
+    /* The pointer to the end of the source. */
+    const uint8_t *end;
+
+    /* The previous token we were considering. */
+    pm_token_t previous;
+
+    /* The current token we're considering. */
+    pm_token_t current;
+
+    /*
+     * This is a special field set on the parser when we need the parser to jump
+     * to a specific location when lexing the next token, as opposed to just
+     * using the end of the previous token. Normally this is NULL.
+     */
+    const uint8_t *next_start;
+
+    /*
+     * This field indicates the end of a heredoc whose identifier was found on
+     * the current line. If another heredoc is found on the same line, then this
+     * will be moved forward to the end of that heredoc. If no heredocs are
+     * found on a line then this is NULL.
+     */
+    const uint8_t *heredoc_end;
+
+    /* The list of comments that have been found while parsing. */
+    pm_list_t comment_list;
+
+    /* The list of magic comments that have been found while parsing. */
+    pm_list_t magic_comment_list;
+
+    /*
+     * An optional location that represents the location of the __END__ marker
+     * and the rest of the content of the file. This content is loaded into the
+     * DATA constant when the file being parsed is the main file being executed.
+     */
+    pm_location_t data_loc;
+
+    /* The list of warnings that have been found while parsing. */
+    pm_list_t warning_list;
+
+    /* The list of errors that have been found while parsing. */
+    pm_list_t error_list;
+
+    /* The current local scope. */
+    pm_scope_t *current_scope;
+
+    /* The current parsing context. */
+    pm_context_node_t *current_context;
+
+    /*
+     * The hash keys for the hash that is currently being parsed. This is not
+     * usually necessary because it can pass it down the various call chains,
+     * but in the event that you're parsing a hash that is being directly
+     * pushed into another hash with **, we need to share the hash keys so that
+     * we can warn for the nested hash as well.
+     */
+    pm_static_literals_t *current_hash_keys;
+
+    /*
+     * The encoding functions for the current file is attached to the parser as
+     * it's parsing so that it can change with a magic comment.
+     */
+    const pm_encoding_t *encoding;
+
+    /*
+     * When the encoding that is being used to parse the source is changed by
+     * prism, we provide the ability here to call out to a user-defined
+     * function.
+     */
+    pm_encoding_changed_callback_t encoding_changed_callback;
+
+    /*
+     * This pointer indicates where a comment must start if it is to be
+     * considered an encoding comment.
+     */
+    const uint8_t *encoding_comment_start;
+
+    /*
+     * When you are lexing through a file, the lexer needs all of the information
+     * that the parser additionally provides (for example, the local table). So if
+     * you want to properly lex Ruby, you need to actually lex it in the context of
+     * the parser. In order to provide this functionality, we optionally allow a
+     * struct to be attached to the parser that calls back out to a user-provided
+     * callback when each token is lexed.
+     */
+    struct {
+        /*
+         * This is the callback that is called when a token is lexed. It is
+         * passed the opaque data pointer, the parser, and the token that was
+         * lexed.
+         */
+        pm_lex_callback_t callback;
+
+        /*
+         * This opaque pointer is used to provide whatever information the user
+         * deemed necessary to the callback. In our case we use it to pass the
+         * array that the tokens get appended into.
+         */
+        void *data;
+    } lex_callback;
+
+    /*
+     * This is the path of the file being parsed. We use the filepath when
+     * constructing SourceFileNodes.
+     */
+    pm_string_t filepath;
+
+    /*
+     * This constant pool keeps all of the constants defined throughout the file
+     * so that we can reference them later.
+     */
+    pm_constant_pool_t constant_pool;
+
+    /* This is the list of line offsets in the source file. */
+    pm_line_offset_list_t line_offsets;
+
+    /*
+     * State communicated from the lexer to the parser for integer tokens.
+     */
+    struct {
+        /*
+         * A flag indicating the base of the integer (binary, octal, decimal,
+         * hexadecimal). Set during lexing and read during node creation.
+         */
+        pm_node_flags_t base;
+
+        /*
+         * When lexing a decimal integer that fits in a uint32_t, we compute
+         * the value during lexing to avoid re-scanning the digits during
+         * parsing. If lexed is true, this holds the result and
+         * pm_integer_parse can be skipped.
+         */
+        uint32_t value;
+
+        /* Whether value holds a valid pre-computed integer. */
+        bool lexed;
+    } integer;
+
+    /*
+     * This string is used to pass information from the lexer to the parser. It
+     * is particularly necessary because of escape sequences.
+     */
+    pm_string_t current_string;
+
+    /*
+     * The line number at the start of the parse. This will be used to offset
+     * the line numbers of all of the locations.
+     */
+    int32_t start_line;
+
+    /*
+     * When a string-like expression is being lexed, any byte or escape sequence
+     * that resolves to a value whose top bit is set (i.e., >= 0x80) will
+     * explicitly set the encoding to the same encoding as the source.
+     * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
+     * resolves to a value whose top bit is set, then the encoding will be
+     * explicitly set to UTF-8.
+     *
+     * The _next_ time this happens, if the encoding that is about to become the
+     * explicitly set encoding does not match the previously set explicit
+     * encoding, a mixed encoding error will be emitted.
+     *
+     * When the expression is finished being lexed, the explicit encoding
+     * controls the encoding of the expression. For the most part this means
+     * that the expression will either be encoded in the source encoding or
+     * UTF-8. This holds for all encodings except US-ASCII. If the source is
+     * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
+     * expression will be encoded as ASCII-8BIT.
+     *
+     * Note that if the expression is a list, different elements within the same
+     * list can have different encodings, so this will get reset between each
+     * element. Furthermore all of this only applies to lists that support
+     * interpolation, because otherwise escapes that could change the encoding
+     * are ignored.
+     *
+     * At first glance, it may make more sense for this to live on the lexer
+     * mode, but we need it here to communicate back to the parser for character
+     * literals that do not push a new lexer mode.
+     */
+    const pm_encoding_t *explicit_encoding;
+
+    /*
+     * When parsing block exits (e.g., break, next, redo), we need to validate
+     * that they are in correct contexts. For the most part we can do this by
+     * looking at our parent contexts. However, modifier while and until
+     * expressions can change that context to make block exits valid. In these
+     * cases, we need to keep track of the block exits and then validate them
+     * after the expression has been parsed.
+     *
+     * We use a pointer here because we don't want to keep a whole list attached
+     * since this will only be used in the context of begin/end expressions.
+     */
+    pm_node_list_t *current_block_exits;
+
+    /* The version of prism that we should use to parse. */
+    pm_options_version_t version;
+
+    /* The command line flags given from the options. */
+    uint8_t command_line;
+
+    /*
+     * Whether or not we have found a frozen_string_literal magic comment with
+     * a true or false value.
+     * May be:
+     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED
+     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED
+     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
+     */
+    int8_t frozen_string_literal;
+
+    /*
+     * Whether or not we are parsing an eval string. This impacts whether or not
+     * we should evaluate if block exits/yields are valid.
+     */
+    bool parsing_eval;
+
+    /*
+     * Whether or not we are parsing a "partial" script, which is a script that
+     * will be evaluated in the context of another script, so we should not
+     * check jumps (next/break/etc.) for validity.
+     */
+    bool partial_script;
+
+    /* Whether or not we're at the beginning of a command. */
+    bool command_start;
+
+    /*
+     * Whether or not we're currently parsing the body of an endless method
+     * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be
+     * consumed by commands (it should bubble up to the outer context).
+     */
+    bool in_endless_def_body;
+
+    /* Whether or not we're currently recovering from a syntax error. */
+    bool recovering;
+
+    /*
+     * Whether or not the source being parsed could become valid if more input
+     * were appended. This is set to false when the parser encounters a token
+     * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to
+     * merely incomplete.
+     */
+    bool continuable;
+
+    /*
+     * This is very specialized behavior for when you want to parse in a context
+     * that does not respect encoding comments. Its main use case is translating
+     * into the whitequark/parser AST which re-encodes source files in UTF-8
+     * before they are parsed and ignores encoding comments.
+     */
+    bool encoding_locked;
+
+    /*
+     * Whether or not the encoding has been changed by a magic comment. We use
+     * this to provide a fast path for the lexer instead of going through the
+     * function pointer.
+     */
+    bool encoding_changed;
+
+    /*
+     * This flag indicates that we are currently parsing a pattern matching
+     * expression and impacts that calculation of newlines.
+     */
+    bool pattern_matching_newlines;
+
+    /* This flag indicates that we are currently parsing a keyword argument. */
+    bool in_keyword_arg;
+
+    /*
+     * Whether or not the parser has seen a token that has semantic meaning
+     * (i.e., a token that is not a comment or whitespace).
+     */
+    bool semantic_token_seen;
+
+    /*
+     * By default, Ruby always warns about mismatched indentation. This can be
+     * toggled with a magic comment.
+     */
+    bool warn_mismatched_indentation;
+
+#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR)
+    /*
+     * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding
+     * the nibble-based tables on every call when the charset hasn't changed
+     * (which is the common case during string/regex/list lexing).
+     */
+    struct {
+        /* The cached charset (null-terminated, max 11 chars + NUL). */
+        uint8_t charset[12];
+
+        /* Nibble-based low lookup table for SIMD matching. */
+        uint8_t low_lut[16];
+
+        /* Nibble-based high lookup table for SIMD matching. */
+        uint8_t high_lut[16];
+
+        /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */
+        uint64_t table[4];
+    } strpbrk_cache;
+#endif
+};
+
+/*
+ * Initialize a parser with the given start and end pointers.
+ */
+void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options);
+
+/*
+ * Free the memory held by the given parser.
+ *
+ * This does not free the `pm_options_t` object that was used to initialize the
+ * parser.
+ */
+void pm_parser_cleanup(pm_parser_t *parser);
+
+#endif
diff --git a/prism/internal/regexp.h b/prism/internal/regexp.h
new file mode 100644
index 0000000000..3710c984fc
--- /dev/null
+++ b/prism/internal/regexp.h
@@ -0,0 +1,41 @@
+#ifndef PRISM_INTERNAL_REGEXP_H
+#define PRISM_INTERNAL_REGEXP_H
+
+#include "prism/ast.h"
+#include "prism/parser.h"
+
+/*
+ * Accumulation state for named capture groups found during regexp parsing.
+ * The caller initializes this with the call node and passes it to
+ * pm_regexp_parse. The regexp parser populates match and names as groups
+ * are found.
+ */
+typedef struct {
+    /* The call node wrapping the regular expression node (for =~). */
+    pm_call_node_t *call;
+
+    /* The match write node being built, or NULL if no captures found yet. */
+    pm_match_write_node_t *match;
+
+    /* The list of capture names found so far (for deduplication). */
+    pm_constant_id_list_t names;
+} pm_regexp_name_data_t;
+
+/*
+ * Callback invoked by pm_regexp_parse() for each named capture group found.
+ */
+typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data);
+
+/*
+ * Parse a regular expression, validate its encoding, and optionally extract
+ * named capture groups. Returns the encoding flags to set on the node.
+ */
+PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
+
+/*
+ * Parse an interpolated regular expression for named capture groups only.
+ * No encoding validation is performed.
+ */
+void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data);
+
+#endif
diff --git a/prism/internal/serialize.h b/prism/internal/serialize.h
new file mode 100644
index 0000000000..e611a0374b
--- /dev/null
+++ b/prism/internal/serialize.h
@@ -0,0 +1,34 @@
+#ifndef PRISM_INTERNAL_SERIALIZE_H
+#define PRISM_INTERNAL_SERIALIZE_H
+
+#include "prism/internal/encoding.h"
+#include "prism/internal/list.h"
+
+#include "prism/ast.h"
+#include "prism/buffer.h"
+#include "prism/excludes.h"
+#include "prism/parser.h"
+
+/* We optionally support serializing to a binary string. For systems that do not
+ * want or need this functionality, it can be turned off with the
+ * PRISM_EXCLUDE_SERIALIZATION define. */
+#ifndef PRISM_EXCLUDE_SERIALIZATION
+
+/*
+ * Serialize the given list of comments to the given buffer.
+ */
+void pm_serialize_comment_list(pm_list_t *list, pm_buffer_t *buffer);
+
+/*
+ * Serialize the name of the encoding to the buffer.
+ */
+void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer);
+
+/*
+ * Serialize the encoding, metadata, nodes, and constant pool.
+ */
+void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer);
+
+#endif
+
+#endif
diff --git a/prism/internal/source.h b/prism/internal/source.h
new file mode 100644
index 0000000000..b3c2b55be3
--- /dev/null
+++ b/prism/internal/source.h
@@ -0,0 +1,72 @@
+#ifndef PRISM_INTERNAL_SOURCE_H
+#define PRISM_INTERNAL_SOURCE_H
+
+#include "prism/source.h"
+#include "prism/buffer.h"
+
+#include <stdbool.h>
+
+/*
+ * The type of source, which determines cleanup behavior.
+ */
+typedef enum {
+    /* Wraps existing constant memory, no cleanup. */
+    PM_SOURCE_CONSTANT,
+
+    /* Wraps existing shared memory (non-owning slice), no cleanup. */
+    PM_SOURCE_SHARED,
+
+    /* Owns a heap-allocated buffer, freed on cleanup. */
+    PM_SOURCE_OWNED,
+
+    /* Memory-mapped file, unmapped on cleanup. */
+    PM_SOURCE_MAPPED,
+
+    /* Stream source backed by a pm_buffer_t. */
+    PM_SOURCE_STREAM
+} pm_source_type_t;
+
+/*
+ * The internal representation of a source.
+ */
+struct pm_source_t {
+    /* A pointer to the start of the source data. */
+    const uint8_t *source;
+
+    /* The length of the source data in bytes. */
+    size_t length;
+
+    /* The type of the source. */
+    pm_source_type_t type;
+
+    /* Stream-specific data, only used for PM_SOURCE_STREAM sources. */
+    struct {
+        /* The buffer that holds the accumulated stream data. */
+        pm_buffer_t *buffer;
+
+        /* The stream object to read from. */
+        void *stream;
+
+        /* The function to use to read from the stream. */
+        pm_source_stream_fgets_t *fgets;
+
+        /* The function to use to check if the stream is at EOF. */
+        pm_source_stream_feof_t *feof;
+
+        /* Whether the stream has reached EOF. */
+        bool eof;
+    } stream;
+};
+
+/*
+ * Read from a stream into the source's internal buffer. This is used by
+ * pm_parse_stream to incrementally read the source.
+ */
+bool pm_source_stream_read(pm_source_t *source);
+
+/*
+ * Returns whether the stream source has reached EOF.
+ */
+bool pm_source_stream_eof(const pm_source_t *source);
+
+#endif
diff --git a/prism/internal/static_literals.h b/prism/internal/static_literals.h
new file mode 100644
index 0000000000..d59002ac0a
--- /dev/null
+++ b/prism/internal/static_literals.h
@@ -0,0 +1,98 @@
+#ifndef PRISM_INTERNAL_STATIC_LITERALS_H
+#define PRISM_INTERNAL_STATIC_LITERALS_H
+
+#include "prism/ast.h"
+#include "prism/buffer.h"
+#include "prism/line_offset_list.h"
+
+/*
+ * An internal hash table for a set of nodes.
+ */
+typedef struct {
+    /* The array of nodes in the hash table. */
+    pm_node_t **nodes;
+
+    /* The size of the hash table. */
+    uint32_t size;
+
+    /* The space that has been allocated in the hash table. */
+    uint32_t capacity;
+} pm_node_hash_t;
+
+/*
+ * Certain sets of nodes (hash keys and when clauses) check for duplicate nodes
+ * to alert the user of potential issues. To do this, we keep a set of the nodes
+ * that have been seen so far, and compare whenever we find a new node.
+ *
+ * We bucket the nodes based on their type to minimize the number of comparisons
+ * that need to be performed.
+ */
+typedef struct {
+    /*
+     * This is the set of IntegerNode and SourceLineNode instances.
+     */
+    pm_node_hash_t integer_nodes;
+
+    /*
+     * This is the set of FloatNode instances.
+     */
+    pm_node_hash_t float_nodes;
+
+    /*
+     * This is the set of RationalNode and ImaginaryNode instances.
+     */
+    pm_node_hash_t number_nodes;
+
+    /*
+     * This is the set of StringNode and SourceFileNode instances.
+     */
+    pm_node_hash_t string_nodes;
+
+    /*
+     * This is the set of RegularExpressionNode instances.
+     */
+    pm_node_hash_t regexp_nodes;
+
+    /*
+     * This is the set of SymbolNode instances.
+     */
+    pm_node_hash_t symbol_nodes;
+
+    /*
+     * A pointer to the last TrueNode instance that was inserted, or NULL.
+     */
+    pm_node_t *true_node;
+
+    /*
+     * A pointer to the last FalseNode instance that was inserted, or NULL.
+     */
+    pm_node_t *false_node;
+
+    /*
+     * A pointer to the last NilNode instance that was inserted, or NULL.
+     */
+    pm_node_t *nil_node;
+
+    /*
+     * A pointer to the last SourceEncodingNode instance that was inserted, or
+     * NULL.
+     */
+    pm_node_t *source_encoding_node;
+} pm_static_literals_t;
+
+/*
+ * Add a node to the set of static literals.
+ */
+pm_node_t * pm_static_literals_add(const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace);
+
+/*
+ * Free the internal memory associated with the given static literals set.
+ */
+void pm_static_literals_free(pm_static_literals_t *literals);
+
+/*
+ * Create a string-based representation of the given static literal.
+ */
+void pm_static_literal_inspect(pm_buffer_t *buffer, const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, const char *encoding_name, const pm_node_t *node);
+
+#endif
diff --git a/prism/internal/stringy.h b/prism/internal/stringy.h
new file mode 100644
index 0000000000..1aaa23ea75
--- /dev/null
+++ b/prism/internal/stringy.h
@@ -0,0 +1,30 @@
+#ifndef PRISM_INTERNAL_STRINGY_H
+#define PRISM_INTERNAL_STRINGY_H
+
+#include "prism/stringy.h"
+
+/*
+ * Defines an empty string. This is useful for initializing a string that will
+ * be filled in later.
+ */
+#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 })
+
+/*
+ * Initialize a shared string that is based on initial input.
+ */
+void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end);
+
+/*
+ * Compare the underlying lengths and bytes of two strings. Returns 0 if the
+ * strings are equal, a negative number if the left string is less than the
+ * right string, and a positive number if the left string is greater than the
+ * right string.
+ */
+int pm_string_compare(const pm_string_t *left, const pm_string_t *right);
+
+/*
+ * Free the associated memory of the given string.
+ */
+void pm_string_cleanup(pm_string_t *string);
+
+#endif
diff --git a/prism/internal/strncasecmp.h b/prism/internal/strncasecmp.h
new file mode 100644
index 0000000000..775f6a993e
--- /dev/null
+++ b/prism/internal/strncasecmp.h
@@ -0,0 +1,18 @@
+#ifndef PRISM_INTERNAL_STRNCASECMP_H
+#define PRISM_INTERNAL_STRNCASECMP_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Compare two strings, ignoring case, up to the given length. Returns 0 if the
+ * strings are equal, a negative number if string1 is less than string2, or a
+ * positive number if string1 is greater than string2.
+ *
+ * Note that this is effectively our own implementation of strncasecmp, but it's
+ * not available on all of the platforms we want to support so we're rolling it
+ * here.
+ */
+int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length);
+
+#endif
diff --git a/prism/internal/strpbrk.h b/prism/internal/strpbrk.h
new file mode 100644
index 0000000000..d64156c002
--- /dev/null
+++ b/prism/internal/strpbrk.h
@@ -0,0 +1,33 @@
+#ifndef PRISM_INTERNAL_STRPBRK_H
+#define PRISM_INTERNAL_STRPBRK_H
+
+#include "prism/parser.h"
+
+/* The maximum number of bytes in a strpbrk charset. */
+#define PM_STRPBRK_CACHE_SIZE 16
+
+#include <stddef.h>
+#include <stdint.h>
+
+/*
+ * Here we have rolled our own version of strpbrk. The standard library strpbrk
+ * has undefined behavior when the source string is not null-terminated. We want
+ * to support strings that are not null-terminated because pm_parse does not
+ * have the contract that the string is null-terminated. (This is desirable
+ * because it means the extension can call pm_parse with the result of a call to
+ * mmap).
+ *
+ * The standard library strpbrk also does not support passing a maximum length
+ * to search. We want to support this for the reason mentioned above, but we
+ * also don't want it to stop on null bytes. Ruby actually allows null bytes
+ * within strings, comments, regular expressions, etc. So we need to be able to
+ * skip past them.
+ *
+ * Finally, we want to support encodings wherein the charset could contain
+ * characters that are trailing bytes of multi-byte characters. For example, in
+ * Shift-JIS, the backslash character can be a trailing byte. In that case we
+ * need to take a slower path and iterate one multi-byte character at a time.
+ */
+const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate);
+
+#endif
diff --git a/prism/internal/tokens.h b/prism/internal/tokens.h
new file mode 100644
index 0000000000..3a983e54ae
--- /dev/null
+++ b/prism/internal/tokens.h
@@ -0,0 +1,11 @@
+#ifndef PRISM_INTERNAL_TOKENS_H
+#define PRISM_INTERNAL_TOKENS_H
+
+#include "prism/ast.h"
+
+/*
+ * Returns the human name of the given token type.
+ */
+const char * pm_token_str(pm_token_type_t token_type);
+
+#endif