diff options
Diffstat (limited to 'prism/internal')
| -rw-r--r-- | prism/internal/allocator.h | 68 | ||||
| -rw-r--r-- | prism/internal/allocator_debug.h | 88 | ||||
| -rw-r--r-- | prism/internal/arena.h | 108 | ||||
| -rw-r--r-- | prism/internal/bit.h | 42 | ||||
| -rw-r--r-- | prism/internal/buffer.h | 91 | ||||
| -rw-r--r-- | prism/internal/char.h | 139 | ||||
| -rw-r--r-- | prism/internal/comments.h | 20 | ||||
| -rw-r--r-- | prism/internal/constant_pool.h | 117 | ||||
| -rw-r--r-- | prism/internal/encoding.h | 242 | ||||
| -rw-r--r-- | prism/internal/integer.h | 68 | ||||
| -rw-r--r-- | prism/internal/isinf.h | 16 | ||||
| -rw-r--r-- | prism/internal/line_offset_list.h | 34 | ||||
| -rw-r--r-- | prism/internal/list.h | 62 | ||||
| -rw-r--r-- | prism/internal/magic_comments.h | 23 | ||||
| -rw-r--r-- | prism/internal/memchr.h | 15 | ||||
| -rw-r--r-- | prism/internal/node.h | 32 | ||||
| -rw-r--r-- | prism/internal/options.h | 212 | ||||
| -rw-r--r-- | prism/internal/parser.h | 958 | ||||
| -rw-r--r-- | prism/internal/regexp.h | 41 | ||||
| -rw-r--r-- | prism/internal/serialize.h | 34 | ||||
| -rw-r--r-- | prism/internal/source.h | 72 | ||||
| -rw-r--r-- | prism/internal/static_literals.h | 98 | ||||
| -rw-r--r-- | prism/internal/stringy.h | 30 | ||||
| -rw-r--r-- | prism/internal/strncasecmp.h | 18 | ||||
| -rw-r--r-- | prism/internal/strpbrk.h | 33 | ||||
| -rw-r--r-- | prism/internal/tokens.h | 11 |
26 files changed, 2672 insertions, 0 deletions
diff --git a/prism/internal/allocator.h b/prism/internal/allocator.h new file mode 100644 index 0000000000..6c54010dbf --- /dev/null +++ b/prism/internal/allocator.h @@ -0,0 +1,68 @@ +#ifndef PRISM_INTERNAL_ALLOCATOR_H +#define PRISM_INTERNAL_ALLOCATOR_H + +/* If you build Prism with a custom allocator, configure it with + * "-D PRISM_XALLOCATOR" to use your own allocator that defines xmalloc, + * xrealloc, xcalloc, and xfree. + * + * For example, your `prism_xallocator.h` file could look like this: + * + * ``` + * #ifndef PRISM_XALLOCATOR_H + * #define PRISM_XALLOCATOR_H + * #define xmalloc my_malloc + * #define xrealloc my_realloc + * #define xcalloc my_calloc + * #define xfree my_free + * #define xrealloc_sized my_realloc_sized // (optional) + * #define xfree_sized my_free_sized // (optional) + * #endif + * ``` + */ +#ifdef PRISM_XALLOCATOR + #include "prism_xallocator.h" +#else + #ifndef xmalloc + /* The malloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xmalloc malloc + #endif + + #ifndef xrealloc + /* The realloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xrealloc realloc + #endif + + #ifndef xcalloc + /* The calloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xcalloc calloc + #endif + + #ifndef xfree + /* The free function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xfree free + #endif +#endif + +#ifndef xfree_sized + /* The free_sized function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. If not defined, defaults to calling xfree. + */ + #define xfree_sized(p, s) xfree(((void)(s), (p))) +#endif + +#ifndef xrealloc_sized + /* The xrealloc_sized function that should be used. This can be overridden + * with the PRISM_XALLOCATOR define. If not defined, defaults to calling + * xrealloc. */ + #define xrealloc_sized(p, ns, os) xrealloc((p), ((void)(os), (ns))) +#endif + +#ifdef PRISM_BUILD_DEBUG + #include "prism/internal/allocator_debug.h" +#endif + +#endif diff --git a/prism/internal/allocator_debug.h b/prism/internal/allocator_debug.h new file mode 100644 index 0000000000..846e96ba2d --- /dev/null +++ b/prism/internal/allocator_debug.h @@ -0,0 +1,88 @@ +#ifndef PRISM_INTERNAL_ALLOCATOR_DEBUG_H +#define PRISM_INTERNAL_ALLOCATOR_DEBUG_H + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +static inline void * +pm_allocator_debug_malloc(size_t size) { + size_t *memory = xmalloc(size + sizeof(size_t)); + memory[0] = size; + return memory + 1; +} + +static inline void * +pm_allocator_debug_calloc(size_t nmemb, size_t size) { + size_t total_size = nmemb * size; + void *ptr = pm_allocator_debug_malloc(total_size); + memset(ptr, 0, total_size); + return ptr; +} + +static inline void * +pm_allocator_debug_realloc(void *ptr, size_t size) { + if (ptr == NULL) { + return pm_allocator_debug_malloc(size); + } + + size_t *memory = (size_t *)ptr; + void *raw_memory = memory - 1; + memory = (size_t *)xrealloc(raw_memory, size + sizeof(size_t)); + memory[0] = size; + return memory + 1; +} + +static inline void +pm_allocator_debug_free(void *ptr) { + if (ptr != NULL) { + size_t *memory = (size_t *)ptr; + xfree(memory - 1); + } +} + +static inline void +pm_allocator_debug_free_sized(void *ptr, size_t old_size) { + if (ptr != NULL) { + size_t *memory = (size_t *)ptr; + if (old_size != memory[-1]) { + fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but freed with size %lu\n", ptr, memory[-1], old_size); + abort(); + } + xfree_sized(memory - 1, old_size + sizeof(size_t)); + } +} + +static inline void * +pm_allocator_debug_realloc_sized(void *ptr, size_t size, size_t old_size) { + if (ptr == NULL) { + if (old_size != 0) { + fprintf(stderr, "[BUG] realloc_sized called with NULL pointer and old size %lu\n", old_size); + abort(); + } + return pm_allocator_debug_malloc(size); + } + + size_t *memory = (size_t *)ptr; + if (old_size != memory[-1]) { + fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but realloced with size %lu\n", ptr, memory[-1], old_size); + abort(); + } + return pm_allocator_debug_realloc(ptr, size); +} + +#undef xmalloc +#undef xrealloc +#undef xcalloc +#undef xfree +#undef xrealloc_sized +#undef xfree_sized + +#define xmalloc pm_allocator_debug_malloc +#define xrealloc pm_allocator_debug_realloc +#define xcalloc pm_allocator_debug_calloc +#define xfree pm_allocator_debug_free +#define xrealloc_sized pm_allocator_debug_realloc_sized +#define xfree_sized pm_allocator_debug_free_sized + +#endif diff --git a/prism/internal/arena.h b/prism/internal/arena.h new file mode 100644 index 0000000000..2e413b42bf --- /dev/null +++ b/prism/internal/arena.h @@ -0,0 +1,108 @@ +#ifndef PRISM_INTERNAL_ARENA_H +#define PRISM_INTERNAL_ARENA_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/flex_array.h" +#include "prism/compiler/force_inline.h" +#include "prism/compiler/inline.h" + +#include "prism/arena.h" + +#include <stddef.h> +#include <string.h> + +/* + * A single block of memory in the arena. Blocks are linked via prev pointers so + * they can be freed by walking the chain. + */ +typedef struct pm_arena_block { + /* The previous block in the chain (for freeing). */ + struct pm_arena_block *prev; + + /* The total usable bytes in data[]. */ + size_t capacity; + + /* The number of bytes consumed so far. */ + size_t used; + + /* The block's data. */ + char data[PM_FLEX_ARRAY_LENGTH]; +} pm_arena_block_t; + +/* + * A bump allocator. Allocations are made by bumping a pointer within the + * current block. When a block is full, a new block is allocated and linked to + * the previous one. All blocks are freed at once by walking the chain. + */ +struct pm_arena_t { + /* The active block (allocate from here). */ + pm_arena_block_t *current; + + /* The number of blocks allocated. */ + size_t block_count; +}; + +/* + * Free all blocks in the arena. After this call, all pointers returned by + * pm_arena_alloc and pm_arena_zalloc are invalid. + */ +void pm_arena_cleanup(pm_arena_t *arena); + +/* + * Ensure the arena has at least `capacity` bytes available in its current + * block, allocating a new block if necessary. This allows callers to + * pre-size the arena to avoid repeated small block allocations. + */ +void pm_arena_reserve(pm_arena_t *arena, size_t capacity); + +/* + * Slow path for pm_arena_alloc: allocate a new block and return a pointer to + * the first `size` bytes. Do not call directly — use pm_arena_alloc instead. + */ +void * pm_arena_alloc_slow(pm_arena_t *arena, size_t size); + +/* + * Allocate memory from the arena. The returned memory is NOT zeroed. This + * function is infallible — it aborts on allocation failure. + * + * The fast path (bump pointer within the current block) is inlined at each + * call site. The slow path (new block allocation) is out-of-line. + */ +static PRISM_FORCE_INLINE void * +pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment) { + if (arena->current != NULL) { + size_t used_aligned = (arena->current->used + alignment - 1) & ~(alignment - 1); + size_t needed = used_aligned + size; + + if (used_aligned >= arena->current->used && needed >= used_aligned && needed <= arena->current->capacity) { + arena->current->used = needed; + return arena->current->data + used_aligned; + } + } + + return pm_arena_alloc_slow(arena, size); +} + +/* + * Allocate zero-initialized memory from the arena. This function is infallible + * — it aborts on allocation failure. + */ +static PRISM_INLINE void * +pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment) { + void *ptr = pm_arena_alloc(arena, size, alignment); + memset(ptr, 0, size); + return ptr; +} + +/* + * Allocate memory from the arena and copy the given data into it. This is a + * convenience wrapper around pm_arena_alloc + memcpy. + */ +static PRISM_INLINE void * +pm_arena_memdup(pm_arena_t *arena, const void *src, size_t size, size_t alignment) { + void *dst = pm_arena_alloc(arena, size, alignment); + memcpy(dst, src, size); + return dst; +} + +#endif diff --git a/prism/internal/bit.h b/prism/internal/bit.h new file mode 100644 index 0000000000..b0111a4c2c --- /dev/null +++ b/prism/internal/bit.h @@ -0,0 +1,42 @@ +#ifndef PRISM_INTERNAL_BIT_H +#define PRISM_INTERNAL_BIT_H + +#include "prism/compiler/inline.h" + +/* + * Count trailing zero bits in a 64-bit value. Used by SWAR identifier scanning + * to find the first non-matching byte in a word. + * + * Precondition: v must be nonzero. The result is undefined when v == 0 + * (matching the behavior of __builtin_ctzll and _BitScanForward64). + */ +#if defined(__GNUC__) || defined(__clang__) +#define pm_ctzll(v) ((unsigned) __builtin_ctzll(v)) +#elif defined(_MSC_VER) +#include <intrin.h> +#include <stdint.h> + +static PRISM_INLINE unsigned +pm_ctzll(uint64_t v) { + unsigned long index; + _BitScanForward64(&index, v); + return (unsigned) index; +} +#else +#include <stdint.h> + +static PRISM_INLINE unsigned +pm_ctzll(uint64_t v) { + unsigned c = 0; + v &= (uint64_t) (-(int64_t) v); + if (v & 0x00000000FFFFFFFFULL) c += 0; else c += 32; + if (v & 0x0000FFFF0000FFFFULL) c += 0; else c += 16; + if (v & 0x00FF00FF00FF00FFULL) c += 0; else c += 8; + if (v & 0x0F0F0F0F0F0F0F0FULL) c += 0; else c += 4; + if (v & 0x3333333333333333ULL) c += 0; else c += 2; + if (v & 0x5555555555555555ULL) c += 0; else c += 1; + return c; +} +#endif + +#endif diff --git a/prism/internal/buffer.h b/prism/internal/buffer.h new file mode 100644 index 0000000000..a849bbf8e6 --- /dev/null +++ b/prism/internal/buffer.h @@ -0,0 +1,91 @@ +#ifndef PRISM_INTERNAL_BUFFER_H +#define PRISM_INTERNAL_BUFFER_H + +#include "prism/compiler/format.h" + +#include "prism/buffer.h" + +#include <stdbool.h> +#include <stdint.h> + +/* + * A simple memory buffer that stores data in a contiguous block of memory. + */ +struct pm_buffer_t { + /* The length of the buffer in bytes. */ + size_t length; + + /* The capacity of the buffer in bytes that has been allocated. */ + size_t capacity; + + /* A pointer to the start of the buffer. */ + char *value; +}; + +/* Initialize a pm_buffer_t with the given capacity. */ +void pm_buffer_init(pm_buffer_t *buffer, size_t capacity); + +/* Free the memory held by the buffer. */ +void pm_buffer_cleanup(pm_buffer_t *buffer); + +/* Append the given amount of space as zeroes to the buffer. */ +void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length); + +/* Append a formatted string to the buffer. */ +void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3); + +/* Append a string to the buffer. */ +void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length); + +/* Append a list of bytes to the buffer. */ +void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length); + +/* Append a single byte to the buffer. */ +void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value); + +/* Append a 32-bit unsigned integer to the buffer as a variable-length integer. */ +void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value); + +/* Append a 32-bit signed integer to the buffer as a variable-length integer. */ +void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); + +/* Append a double to the buffer. */ +void pm_buffer_append_double(pm_buffer_t *buffer, double value); + +/* Append a unicode codepoint to the buffer. */ +bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value); + +/* + * The different types of escaping that can be performed by the buffer when + * appending a slice of Ruby source code. + */ +typedef enum { + PM_BUFFER_ESCAPING_RUBY, + PM_BUFFER_ESCAPING_JSON +} pm_buffer_escaping_t; + +/* Append a slice of source code to the buffer. */ +void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping); + +/* Prepend the given string to the buffer. */ +void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length); + +/* Concatenate one buffer onto another. */ +void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source); + +/* + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + */ +void pm_buffer_clear(pm_buffer_t *buffer); + +/* Strip the whitespace from the end of the buffer. */ +void pm_buffer_rstrip(pm_buffer_t *buffer); + +/* Checks if the buffer includes the given value. */ +size_t pm_buffer_index(const pm_buffer_t *buffer, char value); + +/* Insert the given string into the buffer at the given index. */ +void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length); + +#endif diff --git a/prism/internal/char.h b/prism/internal/char.h new file mode 100644 index 0000000000..9a58fba8c5 --- /dev/null +++ b/prism/internal/char.h @@ -0,0 +1,139 @@ +#ifndef PRISM_INTERNAL_CHAR_H +#define PRISM_INTERNAL_CHAR_H + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" +#include "prism/line_offset_list.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/* Bit flag for whitespace characters in pm_byte_table. */ +#define PRISM_CHAR_BIT_WHITESPACE (1 << 0) + +/* Bit flag for inline whitespace characters in pm_byte_table. */ +#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1) + +/* + * A lookup table for classifying bytes. Each entry is a bitfield of + * PRISM_CHAR_BIT_* flags. Defined in char.c. + */ +extern const uint8_t pm_byte_table[256]; + +/* Returns true if the given character is a whitespace character. */ +static PRISM_FORCE_INLINE bool +pm_char_is_whitespace(const uint8_t b) { + return (pm_byte_table[b] & PRISM_CHAR_BIT_WHITESPACE) != 0; +} + +/* Returns true if the given character is an inline whitespace character. */ +static PRISM_FORCE_INLINE bool +pm_char_is_inline_whitespace(const uint8_t b) { + return (pm_byte_table[b] & PRISM_CHAR_BIT_INLINE_WHITESPACE) != 0; +} + +/* + * Returns the number of characters at the start of the string that are inline + * whitespace (space/tab). Scans the byte table directly for use in hot paths. + */ +static PRISM_FORCE_INLINE size_t +pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) { + if (length <= 0) return 0; + size_t size = 0; + size_t maximum = (size_t) length; + while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_INLINE_WHITESPACE)) size++; + return size; +} + +/* + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. + */ +size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_arena_t *arena, pm_line_offset_list_t *line_offsets, uint32_t start_offset); + +/* + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + */ +size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are regexp + * options. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + + +/* Returns true if the given character is a binary digit. */ +bool pm_char_is_binary_digit(const uint8_t b); + +/* Returns true if the given character is an octal digit. */ +bool pm_char_is_octal_digit(const uint8_t b); + +/* Returns true if the given character is a decimal digit. */ +bool pm_char_is_decimal_digit(const uint8_t b); + +/* Returns true if the given character is a hexadecimal digit. */ +bool pm_char_is_hexadecimal_digit(const uint8_t b); + +#endif diff --git a/prism/internal/comments.h b/prism/internal/comments.h new file mode 100644 index 0000000000..bb3039a658 --- /dev/null +++ b/prism/internal/comments.h @@ -0,0 +1,20 @@ +#ifndef PRISM_INTERNAL_COMMENTS_H +#define PRISM_INTERNAL_COMMENTS_H + +#include "prism/comments.h" + +#include "prism/internal/list.h" + +/* A comment found while parsing. */ +struct pm_comment_t { + /* The embedded base node. */ + pm_list_node_t node; + + /* The location of the comment in the source. */ + pm_location_t location; + + /* The type of the comment. */ + pm_comment_type_t type; +}; + +#endif diff --git a/prism/internal/constant_pool.h b/prism/internal/constant_pool.h new file mode 100644 index 0000000000..fa2be783f5 --- /dev/null +++ b/prism/internal/constant_pool.h @@ -0,0 +1,117 @@ +#ifndef PRISM_INTERNAL_CONSTANT_POOL_H +#define PRISM_INTERNAL_CONSTANT_POOL_H + +#include "prism/constant_pool.h" + +#include "prism/arena.h" + +#include <stdbool.h> + +/* A constant in the pool which effectively stores a string. */ +struct pm_constant_t { + /* A pointer to the start of the string. */ + const uint8_t *start; + + /* The length of the string. */ + size_t length; +}; + +/* + * The type of bucket in the constant pool hash map. This determines how the + * bucket should be freed. + */ +typedef unsigned int pm_constant_pool_bucket_type_t; + +/* By default, each constant is a slice of the source. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0; + +/* An owned constant is one for which memory has been allocated. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1; + +/* A constant constant is known at compile time. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2; + +/* A bucket in the hash map. */ +typedef struct { + /* The incremental ID used for indexing back into the pool. */ + unsigned int id: 30; + + /* The type of the bucket, which determines how to free it. */ + pm_constant_pool_bucket_type_t type: 2; + + /* The hash of the bucket. */ + uint32_t hash; + + /* + * A pointer to the start of the string, stored directly in the bucket to + * avoid a pointer chase to the constants array during probing. + */ + const uint8_t *start; + + /* The length of the string. */ + size_t length; +} pm_constant_pool_bucket_t; + +/* The overall constant pool, which stores constants found while parsing. */ +struct pm_constant_pool_t { + /* The buckets in the hash map. */ + pm_constant_pool_bucket_t *buckets; + + /* The constants that are stored in the buckets. */ + pm_constant_t *constants; + + /* The number of buckets in the hash map. */ + uint32_t size; + + /* The number of buckets that have been allocated in the hash map. */ + uint32_t capacity; +}; + +/* + * When we allocate constants into the pool, we reserve 0 to mean that the slot + * is not yet filled. This constant is reused in other places to indicate the + * lack of a constant id. + */ +#define PM_CONSTANT_ID_UNSET 0 + +/* Initialize a list of constant ids with a given capacity. */ +void pm_constant_id_list_init_capacity(pm_arena_t *arena, pm_constant_id_list_t *list, size_t capacity); + +/* Insert a constant id into a list of constant ids at the specified index. */ +void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id); + +/* Checks if the current constant id list includes the given constant id. */ +bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id); + +/* Initialize a new constant pool with a given capacity. */ +void pm_constant_pool_init(pm_arena_t *arena, pm_constant_pool_t *pool, uint32_t capacity); + +/* Return a pointer to the constant indicated by the given constant id. */ +pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id); + +/* + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + */ +pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool that is a slice of a source string. + * Returns the id of the constant, or 0 if any potential calls to resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_shared(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or 0 if any potential calls to + * resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_owned(pm_arena_t *arena, pm_constant_pool_t *pool, uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or 0 if any potential calls to resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_constant(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +#endif diff --git a/prism/internal/encoding.h b/prism/internal/encoding.h new file mode 100644 index 0000000000..62392ef970 --- /dev/null +++ b/prism/internal/encoding.h @@ -0,0 +1,242 @@ +#ifndef PRISM_INTERNAL_ENCODING_H +#define PRISM_INTERNAL_ENCODING_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/* + * This struct defines the functions necessary to implement the encoding + * interface so we can determine how many bytes the subsequent character takes. + * Each callback should return the number of bytes, or 0 if the next bytes are + * invalid for the encoding and type. + */ +typedef struct { + /* + * Return the number of bytes that the next character takes if it is valid + * in the encoding. Does not read more than n bytes. It is assumed that n is + * at least 1. + */ + size_t (*char_width)(const uint8_t *b, ptrdiff_t n); + + /* + * Return the number of bytes that the next character takes if it is valid + * in the encoding and is alphabetical. Does not read more than n bytes. It + * is assumed that n is at least 1. + */ + size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n); + + /* + * Return the number of bytes that the next character takes if it is valid + * in the encoding and is alphanumeric. Does not read more than n bytes. It + * is assumed that n is at least 1. + */ + size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n); + + /* + * Return true if the next character is valid in the encoding and is an + * uppercase character. Does not read more than n bytes. It is assumed that + * n is at least 1. + */ + bool (*isupper_char)(const uint8_t *b, ptrdiff_t n); + + /* + * The name of the encoding. This should correspond to a value that can be + * passed to Encoding.find in Ruby. + */ + const char *name; + + /* Return true if the encoding is a multibyte encoding. */ + bool multibyte; +} pm_encoding_t; + +/* + * All of the lookup tables use the first bit of each embedded byte to indicate + * whether the codepoint is alphabetical. + */ +#define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0 + +/* + * All of the lookup tables use the second bit of each embedded byte to indicate + * whether the codepoint is alphanumeric. + */ +#define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1 + +/* + * All of the lookup tables use the third bit of each embedded byte to indicate + * whether the codepoint is uppercase. + */ +#define PRISM_ENCODING_UPPERCASE_BIT 1 << 2 + +/* Return the size of the next character in the UTF-8 encoding. */ +size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n); + +/* + * Return the size of the next character in the UTF-8 encoding if it is an + * alphabetical character. + */ +size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n); + +/* + * Return the size of the next character in the UTF-8 encoding if it is an + * alphanumeric character. + */ +size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n); + +/* + * Return true if the next character in the UTF-8 encoding if it is an uppercase + * character. + */ +bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n); + +/* + * This lookup table is referenced in both the UTF-8 encoding file and the + * parser directly in order to speed up the default encoding processing. It is + * used to indicate whether a character is alphabetical, alphanumeric, or + * uppercase in unicode mappings. + */ +extern const uint8_t pm_encoding_unicode_table[256]; + +/* These are all of the encodings that prism supports. */ +typedef enum { + PM_ENCODING_UTF_8 = 0, + PM_ENCODING_US_ASCII, + PM_ENCODING_ASCII_8BIT, + PM_ENCODING_EUC_JP, + PM_ENCODING_WINDOWS_31J, + +/* We optionally support excluding the full set of encodings to only support the + * minimum necessary to process Ruby code without encoding comments. */ +#ifndef PRISM_ENCODING_EXCLUDE_FULL + PM_ENCODING_BIG5, + PM_ENCODING_BIG5_HKSCS, + PM_ENCODING_BIG5_UAO, + PM_ENCODING_CESU_8, + PM_ENCODING_CP51932, + PM_ENCODING_CP850, + PM_ENCODING_CP852, + PM_ENCODING_CP855, + PM_ENCODING_CP949, + PM_ENCODING_CP950, + PM_ENCODING_CP951, + PM_ENCODING_EMACS_MULE, + PM_ENCODING_EUC_JP_MS, + PM_ENCODING_EUC_JIS_2004, + PM_ENCODING_EUC_KR, + PM_ENCODING_EUC_TW, + PM_ENCODING_GB12345, + PM_ENCODING_GB18030, + PM_ENCODING_GB1988, + PM_ENCODING_GB2312, + PM_ENCODING_GBK, + PM_ENCODING_IBM437, + PM_ENCODING_IBM720, + PM_ENCODING_IBM737, + PM_ENCODING_IBM775, + PM_ENCODING_IBM852, + PM_ENCODING_IBM855, + PM_ENCODING_IBM857, + PM_ENCODING_IBM860, + PM_ENCODING_IBM861, + PM_ENCODING_IBM862, + PM_ENCODING_IBM863, + PM_ENCODING_IBM864, + PM_ENCODING_IBM865, + PM_ENCODING_IBM866, + PM_ENCODING_IBM869, + PM_ENCODING_ISO_8859_1, + PM_ENCODING_ISO_8859_2, + PM_ENCODING_ISO_8859_3, + PM_ENCODING_ISO_8859_4, + PM_ENCODING_ISO_8859_5, + PM_ENCODING_ISO_8859_6, + PM_ENCODING_ISO_8859_7, + PM_ENCODING_ISO_8859_8, + PM_ENCODING_ISO_8859_9, + PM_ENCODING_ISO_8859_10, + PM_ENCODING_ISO_8859_11, + PM_ENCODING_ISO_8859_13, + PM_ENCODING_ISO_8859_14, + PM_ENCODING_ISO_8859_15, + PM_ENCODING_ISO_8859_16, + PM_ENCODING_KOI8_R, + PM_ENCODING_KOI8_U, + PM_ENCODING_MAC_CENT_EURO, + PM_ENCODING_MAC_CROATIAN, + PM_ENCODING_MAC_CYRILLIC, + PM_ENCODING_MAC_GREEK, + PM_ENCODING_MAC_ICELAND, + PM_ENCODING_MAC_JAPANESE, + PM_ENCODING_MAC_ROMAN, + PM_ENCODING_MAC_ROMANIA, + PM_ENCODING_MAC_THAI, + PM_ENCODING_MAC_TURKISH, + PM_ENCODING_MAC_UKRAINE, + PM_ENCODING_SHIFT_JIS, + PM_ENCODING_SJIS_DOCOMO, + PM_ENCODING_SJIS_KDDI, + PM_ENCODING_SJIS_SOFTBANK, + PM_ENCODING_STATELESS_ISO_2022_JP, + PM_ENCODING_STATELESS_ISO_2022_JP_KDDI, + PM_ENCODING_TIS_620, + PM_ENCODING_UTF8_MAC, + PM_ENCODING_UTF8_DOCOMO, + PM_ENCODING_UTF8_KDDI, + PM_ENCODING_UTF8_SOFTBANK, + PM_ENCODING_WINDOWS_1250, + PM_ENCODING_WINDOWS_1251, + PM_ENCODING_WINDOWS_1252, + PM_ENCODING_WINDOWS_1253, + PM_ENCODING_WINDOWS_1254, + PM_ENCODING_WINDOWS_1255, + PM_ENCODING_WINDOWS_1256, + PM_ENCODING_WINDOWS_1257, + PM_ENCODING_WINDOWS_1258, + PM_ENCODING_WINDOWS_874, +#endif + + PM_ENCODING_MAXIMUM +} pm_encoding_type_t; + +/* This is the table of all of the encodings that prism supports. */ +extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; + +/* + * This is the default UTF-8 encoding. We need a reference to it to quickly + * create parsers. + */ +#define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) + +/* + * This is the US-ASCII encoding. We need a reference to it to be able to + * compare against it when a string is being created because it could possibly + * need to fall back to ASCII-8BIT. + */ +#define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) + +/* + * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk + * can compare against it because invalid multibyte characters are not a thing + * in this encoding. It is also needed for handling Regexp encoding flags. + */ +#define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) + +/* + * This is the EUC-JP encoding. We need a reference to it to quickly process + * regular expression modifiers. + */ +#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) + +/* + * This is the Windows-31J encoding. We need a reference to it to quickly + * process regular expression modifiers. + */ +#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) + +/* + * Parse the given name of an encoding and return a pointer to the corresponding + * encoding struct if one can be found, otherwise return NULL. + */ +const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end); + +#endif diff --git a/prism/internal/integer.h b/prism/internal/integer.h new file mode 100644 index 0000000000..7c9767e323 --- /dev/null +++ b/prism/internal/integer.h @@ -0,0 +1,68 @@ +/* + * This module provides functions for working with arbitrary-sized integers. + */ +#ifndef PRISM_INTERNAL_INTEGER_H +#define PRISM_INTERNAL_INTEGER_H + +#include "prism/buffer.h" +#include "prism/integer.h" + +#include <stdint.h> + +/* + * An enum controlling the base of an integer. It is expected that the base is + * already known before parsing the integer, even though it could be derived + * from the string itself. + */ +typedef enum { + /* The default decimal base, with no prefix. Leading 0s will be ignored. */ + PM_INTEGER_BASE_DEFAULT, + + /* The binary base, indicated by a 0b or 0B prefix. */ + PM_INTEGER_BASE_BINARY, + + /* The octal base, indicated by a 0, 0o, or 0O prefix. */ + PM_INTEGER_BASE_OCTAL, + + /* The decimal base, indicated by a 0d, 0D, or empty prefix. */ + PM_INTEGER_BASE_DECIMAL, + + /* The hexadecimal base, indicated by a 0x or 0X prefix. */ + PM_INTEGER_BASE_HEXADECIMAL, + + /* + * An unknown base, in which case pm_integer_parse will derive it based on + * the content of the string. This is less efficient and does more + * comparisons, so if callers know the base ahead of time, they should use + * that instead. + */ + PM_INTEGER_BASE_UNKNOWN +} pm_integer_base_t; + +/* + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + */ +void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end); + +/* + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. + */ +int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right); + +/* + * Reduce a ratio of integers to its simplest form. + * + * If either the numerator or denominator do not fit into a 32-bit integer, then + * this function is a no-op. In the future, we may consider reducing even the + * larger numbers, but for now we're going to keep it simple. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator); + +/* Convert an integer to a decimal string. */ +void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer); + +#endif diff --git a/prism/internal/isinf.h b/prism/internal/isinf.h new file mode 100644 index 0000000000..41c160f56d --- /dev/null +++ b/prism/internal/isinf.h @@ -0,0 +1,16 @@ +#ifndef PRISM_INTERNAL_ISINF_H +#define PRISM_INTERNAL_ISINF_H + +/* + * isinf on POSIX systems accepts a float, a double, or a long double. But mingw + * didn't provide an isinf macro, only an isinf function that only accepts + * floats, so we need to use _finite instead. + */ +#ifdef __MINGW64__ + #include <float.h> + #define PRISM_ISINF(x) (!_finite(x)) +#else + #define PRISM_ISINF(x) isinf(x) +#endif + +#endif diff --git a/prism/internal/line_offset_list.h b/prism/internal/line_offset_list.h new file mode 100644 index 0000000000..dac9f7052e --- /dev/null +++ b/prism/internal/line_offset_list.h @@ -0,0 +1,34 @@ +#ifndef PRISM_INTERNAL_LINE_OFFSET_LIST_H +#define PRISM_INTERNAL_LINE_OFFSET_LIST_H + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" +#include "prism/line_offset_list.h" + +/* Initialize a new line offset list with the given capacity. */ +void pm_line_offset_list_init(pm_arena_t *arena, pm_line_offset_list_t *list, size_t capacity); + +/* Clear out the offsets that have been appended to the list. */ +void pm_line_offset_list_clear(pm_line_offset_list_t *list); + +/* Append a new offset to the list (slow path with resize). */ +void pm_line_offset_list_append_slow(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor); + +/* Append a new offset to the list. */ +static PRISM_FORCE_INLINE void +pm_line_offset_list_append(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor) { + if (list->size < list->capacity) { + list->offsets[list->size++] = cursor; + } else { + pm_line_offset_list_append_slow(arena, list, cursor); + } +} + +/* + * Returns the line of the given offset. If the offset is not in the list, the + * line of the closest offset less than the given offset is returned. + */ +int32_t pm_line_offset_list_line(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line); + +#endif diff --git a/prism/internal/list.h b/prism/internal/list.h new file mode 100644 index 0000000000..0ab59ef32a --- /dev/null +++ b/prism/internal/list.h @@ -0,0 +1,62 @@ +#ifndef PRISM_INTERNAL_LIST_H +#define PRISM_INTERNAL_LIST_H + +#include <stddef.h> + +/* + * This struct represents an abstract linked list that provides common + * functionality. It is meant to be used any time a linked list is necessary to + * store data. + * + * The linked list itself operates off a set of pointers. Because the pointers + * are not necessarily sequential, they can be of any size. We use this fact to + * allow the consumer of this linked list to extend the node struct to include + * any data they want. This is done by using the pm_list_node_t as the first + * member of the struct. + * + * For example, if we want to store a list of integers, we can do the following: + * + * ```c + * typedef struct { + * pm_list_node_t node; + * int value; + * } pm_int_node_t; + * + * pm_list_t list = { 0 }; + * pm_int_node_t *node = xmalloc(sizeof(pm_int_node_t)); + * node->value = 5; + * + * pm_list_append(&list, &node->node); + * ``` + * + * The pm_list_t struct is used to represent the overall linked list. It + * contains a pointer to the head and tail of the list. This allows for easy + * iteration and appending of new nodes. + */ +typedef struct pm_list_node { + /* A pointer to the next node in the list. */ + struct pm_list_node *next; +} pm_list_node_t; + +/* + * This represents the overall linked list. It keeps a pointer to the head and + * tail so that iteration is easy and pushing new nodes is easy. + */ +typedef struct { + /* The size of the list. */ + size_t size; + + /* A pointer to the head of the list. */ + pm_list_node_t *head; + + /* A pointer to the tail of the list. */ + pm_list_node_t *tail; +} pm_list_t; + +/* Returns the size of the list. */ +size_t pm_list_size(pm_list_t *list); + +/* Append a node to the given list. */ +void pm_list_append(pm_list_t *list, pm_list_node_t *node); + +#endif diff --git a/prism/internal/magic_comments.h b/prism/internal/magic_comments.h new file mode 100644 index 0000000000..72a581c5d7 --- /dev/null +++ b/prism/internal/magic_comments.h @@ -0,0 +1,23 @@ +#ifndef PRISM_INTERNAL_MAGIC_COMMENTS_H +#define PRISM_INTERNAL_MAGIC_COMMENTS_H + +#include "prism/magic_comments.h" + +#include "prism/internal/list.h" + +/* + * This is a node in the linked list of magic comments that we've found while + * parsing. + */ +struct pm_magic_comment_t { + /* The embedded base node. */ + pm_list_node_t node; + + /* The key of the magic comment. */ + pm_location_t key; + + /* The value of the magic comment. */ + pm_location_t value; +}; + +#endif diff --git a/prism/internal/memchr.h b/prism/internal/memchr.h new file mode 100644 index 0000000000..6f6b0bca30 --- /dev/null +++ b/prism/internal/memchr.h @@ -0,0 +1,15 @@ +#ifndef PRISM_INTERNAL_MEMCHR_H +#define PRISM_INTERNAL_MEMCHR_H + +#include "prism/internal/encoding.h" + +#include <stddef.h> + +/* + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. + */ +const void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding); + +#endif diff --git a/prism/internal/node.h b/prism/internal/node.h new file mode 100644 index 0000000000..ca6d5616d7 --- /dev/null +++ b/prism/internal/node.h @@ -0,0 +1,32 @@ +#ifndef PRISM_INTERNAL_NODE_H +#define PRISM_INTERNAL_NODE_H + +#include "prism/node.h" + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" + +/* + * Slow path for pm_node_list_append: grow the list and append the node. + * Do not call directly — use pm_node_list_append instead. + */ +void pm_node_list_append_slow(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node); + +/* Append a new node onto the end of the node list. */ +static PRISM_FORCE_INLINE void +pm_node_list_append(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) { + if (list->size < list->capacity) { + list->nodes[list->size++] = node; + } else { + pm_node_list_append_slow(arena, list, node); + } +} + +/* Prepend a new node onto the beginning of the node list. */ +void pm_node_list_prepend(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node); + +/* Concatenate the given node list onto the end of the other node list. */ +void pm_node_list_concat(pm_arena_t *arena, pm_node_list_t *list, pm_node_list_t *other); + +#endif diff --git a/prism/internal/options.h b/prism/internal/options.h new file mode 100644 index 0000000000..7e37742a8b --- /dev/null +++ b/prism/internal/options.h @@ -0,0 +1,212 @@ +#ifndef PRISM_INTERNAL_OPTIONS_H +#define PRISM_INTERNAL_OPTIONS_H + +#include "prism/options.h" + +/* A scope of locals surrounding the code that is being parsed. */ +struct pm_options_scope_t { + /* The number of locals in the scope. */ + size_t locals_count; + + /* The names of the locals in the scope. */ + pm_string_t *locals; + + /* Flags for the set of forwarding parameters in this scope. */ + uint8_t forwarding; +}; + +/* + * The version of Ruby syntax that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse in the same way as a specific version of CRuby would have. + */ +typedef enum { + /* + * If an explicit version is not provided, the current version of prism will + * be used. + */ + PM_OPTIONS_VERSION_UNSET = 0, + + /* The vendored version of prism in CRuby 3.3.x. */ + PM_OPTIONS_VERSION_CRUBY_3_3 = 1, + + /* The vendored version of prism in CRuby 3.4.x. */ + PM_OPTIONS_VERSION_CRUBY_3_4 = 2, + + /* The vendored version of prism in CRuby 4.0.x. */ + PM_OPTIONS_VERSION_CRUBY_3_5 = 3, + + /* The vendored version of prism in CRuby 4.0.x. */ + PM_OPTIONS_VERSION_CRUBY_4_0 = 3, + + /* The vendored version of prism in CRuby 4.1.x. */ + PM_OPTIONS_VERSION_CRUBY_4_1 = 4, + + /* The current version of prism. */ + PM_OPTIONS_VERSION_LATEST = PM_OPTIONS_VERSION_CRUBY_4_1 +} pm_options_version_t; + +/* The options that can be passed to the parser. */ +struct pm_options_t { + /* + * The callback to call when additional switches are found in a shebang + * comment. + */ + pm_options_shebang_callback_t shebang_callback; + + /* + * Any additional data that should be passed along to the shebang callback + * if one was set. + */ + void *shebang_callback_data; + + /* The name of the file that is currently being parsed. */ + pm_string_t filepath; + + /* + * The line within the file that the parse starts on. This value is + * 1-indexed. + */ + int32_t line; + + /* + * The name of the encoding that the source file is in. Note that this must + * correspond to a name that can be found with Encoding.find in Ruby. + */ + pm_string_t encoding; + + /* The number of scopes surrounding the code that is being parsed. */ + size_t scopes_count; + + /* + * The scopes surrounding the code that is being parsed. For most parses + * this will be NULL, but for evals it will be the locals that are in scope + * surrounding the eval. Scopes are ordered from the outermost scope to the + * innermost one. + */ + pm_options_scope_t *scopes; + + /* + * The version of prism that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse exactly as a specific version of CRuby. + */ + pm_options_version_t version; + + /* A bitset of the various options that were set on the command line. */ + uint8_t command_line; + + /* + * Whether or not the frozen string literal option has been set. + * May be: + * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET + */ + int8_t frozen_string_literal; + + /* + * Whether or not the encoding magic comments should be respected. This is a + * niche use-case where you want to parse a file with a specific encoding + * but ignore any encoding magic comments at the top of the file. + */ + bool encoding_locked; + + /* + * When the file being parsed is the main script, the shebang will be + * considered for command-line flags (or for implicit -x). The caller needs + * to pass this information to the parser so that it can behave correctly. + */ + bool main_script; + + /* + * When the file being parsed is considered a "partial" script, jumps will + * not be marked as errors if they are not contained within loops/blocks. + * This is used in the case that you're parsing a script that you know will + * be embedded inside another script later, but you do not have that context + * yet. For example, when parsing an ERB template that will be evaluated + * inside another script. + */ + bool partial_script; + + /* + * Whether or not the parser should freeze the nodes that it creates. This + * makes it possible to have a deeply frozen AST that is safe to share + * between concurrency primitives. + */ + bool freeze; +}; + +/* Free the internal memory associated with the options. */ +void pm_options_cleanup(pm_options_t *options); + +/* + * Deserialize an options struct from the given binary string. This is used to + * pass options to the parser from an FFI call so that consumers of the library + * from an FFI perspective don't have to worry about the structure of our + * options structs. Since the source of these calls will be from Ruby + * implementation internals we assume it is from a trusted source. + * + * `data` is assumed to be a valid pointer pointing to well-formed data. The + * layout of this data should be the same every time, and is described below: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the filepath | + * | ... | the filepath bytes | + * | `4` | the line number | + * | `4` | the length the encoding | + * | ... | the encoding bytes | + * | `1` | frozen string literal | + * | `1` | -p command line option | + * | `1` | -n command line option | + * | `1` | -l command line option | + * | `1` | -a command line option | + * | `1` | the version | + * | `1` | encoding locked | + * | `1` | main script | + * | `1` | partial script | + * | `1` | freeze | + * | `4` | the number of scopes | + * | ... | the scopes | + * + * The version field is an enum, so it should be one of the following values: + * + * | value | version | + * | ----- | ------------------------- | + * | `0` | use the latest version of prism | + * | `1` | use the version of prism that is vendored in CRuby 3.3.0 | + * | `2` | use the version of prism that is vendored in CRuby 3.4.0 | + * | `3` | use the version of prism that is vendored in CRuby 4.0.0 | + * | `4` | use the version of prism that is vendored in CRuby 4.1.0 | + * + * Each scope is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the number of locals | + * | `1` | the forwarding flags | + * | ... | the locals | + * + * Each local is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the local | + * | ... | the local bytes | + * + * Some additional things to note about this layout: + * + * * The filepath can have a length of 0, in which case we'll consider it an + * empty string. + * * The line number should be 0-indexed. + * * The encoding can have a length of 0, in which case we'll use the default + * encoding (UTF-8). If it's not 0, it should correspond to a name of an + * encoding that can be passed to `Encoding.find` in Ruby. + * * The frozen string literal, encoding locked, main script, and partial script + * fields are booleans, so their values should be either 0 or 1. + * * The number of scopes can be 0. + */ +void pm_options_read(pm_options_t *options, const char *data); + +#endif diff --git a/prism/internal/parser.h b/prism/internal/parser.h new file mode 100644 index 0000000000..4320cf4029 --- /dev/null +++ b/prism/internal/parser.h @@ -0,0 +1,958 @@ +#ifndef PRISM_INTERNAL_PARSER_H +#define PRISM_INTERNAL_PARSER_H + +#include "prism/compiler/accel.h" + +#include "prism/internal/arena.h" +#include "prism/internal/constant_pool.h" +#include "prism/internal/encoding.h" +#include "prism/internal/list.h" +#include "prism/internal/options.h" +#include "prism/internal/static_literals.h" +#include "prism/internal/strpbrk.h" + +#include "prism/ast.h" +#include "prism/line_offset_list.h" +#include "prism/parser.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/* + * This enum provides various bits that represent different kinds of states that + * the lexer can track. This is used to determine which kind of token to return + * based on the context of the parser. + */ +typedef enum { + PM_LEX_STATE_BIT_BEG, + PM_LEX_STATE_BIT_END, + PM_LEX_STATE_BIT_ENDARG, + PM_LEX_STATE_BIT_ENDFN, + PM_LEX_STATE_BIT_ARG, + PM_LEX_STATE_BIT_CMDARG, + PM_LEX_STATE_BIT_MID, + PM_LEX_STATE_BIT_FNAME, + PM_LEX_STATE_BIT_DOT, + PM_LEX_STATE_BIT_CLASS, + PM_LEX_STATE_BIT_LABEL, + PM_LEX_STATE_BIT_LABELED, + PM_LEX_STATE_BIT_FITEM +} pm_lex_state_bit_t; + +/* + * This enum combines the various bits from the above enum into individual + * values that represent the various states of the lexer. + */ +typedef enum { + PM_LEX_STATE_NONE = 0, + PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), + PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), + PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), + PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), + PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), + PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), + PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), + PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), + PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), + PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), + PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), + PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), + PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), + PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, + PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, + PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN +} pm_lex_state_t; + +/* + * The type of quote that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_QUOTE_NONE, + PM_HEREDOC_QUOTE_SINGLE = '\'', + PM_HEREDOC_QUOTE_DOUBLE = '"', + PM_HEREDOC_QUOTE_BACKTICK = '`', +} pm_heredoc_quote_t; + +/* + * The type of indentation that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_INDENT_NONE, + PM_HEREDOC_INDENT_DASH, + PM_HEREDOC_INDENT_TILDE, +} pm_heredoc_indent_t; + +/* + * All of the information necessary to store to lexing a heredoc. + */ +typedef struct { + /* A pointer to the start of the heredoc identifier. */ + const uint8_t *ident_start; + + /* The length of the heredoc identifier. */ + size_t ident_length; + + /* The type of quote that the heredoc uses. */ + pm_heredoc_quote_t quote; + + /* The type of indentation that the heredoc uses. */ + pm_heredoc_indent_t indent; +} pm_heredoc_lex_mode_t; + +/* + * When lexing Ruby source, the lexer has a small amount of state to tell which + * kind of token it is currently lexing. For example, when we find the start of + * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After + * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that + * are found as part of a string. + */ +typedef struct pm_lex_mode { + /* The type of this lex mode. */ + enum { + /* This state is used when any given token is being lexed. */ + PM_LEX_DEFAULT, + + /* + * This state is used when we're lexing as normal but inside an embedded + * expression of a string. + */ + PM_LEX_EMBEXPR, + + /* + * This state is used when we're lexing a variable that is embedded + * directly inside of a string with the # shorthand. + */ + PM_LEX_EMBVAR, + + /* This state is used when you are inside the content of a heredoc. */ + PM_LEX_HEREDOC, + + /* + * This state is used when we are lexing a list of tokens, as in a %w + * word list literal or a %i symbol list literal. + */ + PM_LEX_LIST, + + /* + * This state is used when a regular expression has been begun and we + * are looking for the terminator. + */ + PM_LEX_REGEXP, + + /* + * This state is used when we are lexing a string or a string-like + * token, as in string content with either quote or an xstring. + */ + PM_LEX_STRING + } mode; + + /* The data associated with this type of lex mode. */ + union { + struct { + /* This keeps track of the nesting level of the list. */ + size_t nesting; + + /* Whether or not interpolation is allowed in this list. */ + bool interpolation; + + /* + * When lexing a list, it takes into account balancing the + * terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* This is the terminator of the list literal. */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the list. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } list; + + struct { + /* + * This keeps track of the nesting level of the regular expression. + */ + size_t nesting; + + /* + * When lexing a regular expression, it takes into account balancing + * the terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* This is the terminator of the regular expression. */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the regular expression. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } regexp; + + struct { + /* This keeps track of the nesting level of the string. */ + size_t nesting; + + /* Whether or not interpolation is allowed in this string. */ + bool interpolation; + + /* + * Whether or not at the end of the string we should allow a :, + * which would indicate this was a dynamic symbol instead of a + * string. + */ + bool label_allowed; + + /* + * When lexing a string, it takes into account balancing the + * terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* + * This is the terminator of the string. It is typically either a + * single or double quote. + */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the string. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } string; + + struct { + /* + * All of the data necessary to lex a heredoc. + */ + pm_heredoc_lex_mode_t base; + + /* + * This is the pointer to the character where lexing should resume + * once the heredoc has been completely processed. + */ + const uint8_t *next_start; + + /* + * This is used to track the amount of common whitespace on each + * line so that we know how much to dedent each line in the case of + * a tilde heredoc. + */ + size_t *common_whitespace; + + /* True if the previous token ended with a line continuation. */ + bool line_continuation; + } heredoc; + } as; + + /* The previous lex state so that it knows how to pop. */ + struct pm_lex_mode *prev; +} pm_lex_mode_t; + +/* + * We pre-allocate a certain number of lex states in order to avoid having to + * call malloc too many times while parsing. You really shouldn't need more than + * this because you only really nest deeply when doing string interpolation. + */ +#define PM_LEX_STACK_SIZE 4 + +/* + * While parsing, we keep track of a stack of contexts. This is helpful for + * error recovery so that we can pop back to a previous context when we hit a + * token that is understood by a parent context but not by the current context. + */ +typedef enum { + /* a null context, used for returning a value from a function */ + PM_CONTEXT_NONE = 0, + + /* a begin statement */ + PM_CONTEXT_BEGIN, + + /* an ensure statement with an explicit begin */ + PM_CONTEXT_BEGIN_ENSURE, + + /* a rescue else statement with an explicit begin */ + PM_CONTEXT_BEGIN_ELSE, + + /* a rescue statement with an explicit begin */ + PM_CONTEXT_BEGIN_RESCUE, + + /* expressions in block arguments using braces */ + PM_CONTEXT_BLOCK_BRACES, + + /* expressions in block arguments using do..end */ + PM_CONTEXT_BLOCK_KEYWORDS, + + /* an ensure statement within a do..end block */ + PM_CONTEXT_BLOCK_ENSURE, + + /* a rescue else statement within a do..end block */ + PM_CONTEXT_BLOCK_ELSE, + + /* expressions in block parameters `foo do |...| end ` */ + PM_CONTEXT_BLOCK_PARAMETERS, + + /* a rescue statement within a do..end block */ + PM_CONTEXT_BLOCK_RESCUE, + + /* a case when statements */ + PM_CONTEXT_CASE_WHEN, + + /* a case in statements */ + PM_CONTEXT_CASE_IN, + + /* a class declaration */ + PM_CONTEXT_CLASS, + + /* an ensure statement within a class statement */ + PM_CONTEXT_CLASS_ENSURE, + + /* a rescue else statement within a class statement */ + PM_CONTEXT_CLASS_ELSE, + + /* a rescue statement within a class statement */ + PM_CONTEXT_CLASS_RESCUE, + + /* a method definition */ + PM_CONTEXT_DEF, + + /* an ensure statement within a method definition */ + PM_CONTEXT_DEF_ENSURE, + + /* a rescue else statement within a method definition */ + PM_CONTEXT_DEF_ELSE, + + /* a rescue statement within a method definition */ + PM_CONTEXT_DEF_RESCUE, + + /* a method definition's parameters */ + PM_CONTEXT_DEF_PARAMS, + + /* a defined? expression */ + PM_CONTEXT_DEFINED, + + /* a method definition's default parameter */ + PM_CONTEXT_DEFAULT_PARAMS, + + /* an else clause */ + PM_CONTEXT_ELSE, + + /* an elsif clause */ + PM_CONTEXT_ELSIF, + + /* an interpolated expression */ + PM_CONTEXT_EMBEXPR, + + /* a for loop */ + PM_CONTEXT_FOR, + + /* a for loop's index */ + PM_CONTEXT_FOR_INDEX, + + /* an if statement */ + PM_CONTEXT_IF, + + /* a lambda expression with braces */ + PM_CONTEXT_LAMBDA_BRACES, + + /* a lambda expression with do..end */ + PM_CONTEXT_LAMBDA_DO_END, + + /* an ensure statement within a lambda expression */ + PM_CONTEXT_LAMBDA_ENSURE, + + /* a rescue else statement within a lambda expression */ + PM_CONTEXT_LAMBDA_ELSE, + + /* a rescue statement within a lambda expression */ + PM_CONTEXT_LAMBDA_RESCUE, + + /* the predicate clause of a loop statement */ + PM_CONTEXT_LOOP_PREDICATE, + + /* the top level context */ + PM_CONTEXT_MAIN, + + /* a module declaration */ + PM_CONTEXT_MODULE, + + /* an ensure statement within a module statement */ + PM_CONTEXT_MODULE_ENSURE, + + /* a rescue else statement within a module statement */ + PM_CONTEXT_MODULE_ELSE, + + /* a rescue statement within a module statement */ + PM_CONTEXT_MODULE_RESCUE, + + /* a multiple target expression */ + PM_CONTEXT_MULTI_TARGET, + + /* a parenthesized expression */ + PM_CONTEXT_PARENS, + + /* an END block */ + PM_CONTEXT_POSTEXE, + + /* a predicate inside an if/elsif/unless statement */ + PM_CONTEXT_PREDICATE, + + /* a BEGIN block */ + PM_CONTEXT_PREEXE, + + /* a modifier rescue clause */ + PM_CONTEXT_RESCUE_MODIFIER, + + /* a singleton class definition */ + PM_CONTEXT_SCLASS, + + /* an ensure statement with a singleton class */ + PM_CONTEXT_SCLASS_ENSURE, + + /* a rescue else statement with a singleton class */ + PM_CONTEXT_SCLASS_ELSE, + + /* a rescue statement with a singleton class */ + PM_CONTEXT_SCLASS_RESCUE, + + /* a ternary expression */ + PM_CONTEXT_TERNARY, + + /* an unless statement */ + PM_CONTEXT_UNLESS, + + /* an until statement */ + PM_CONTEXT_UNTIL, + + /* a while statement */ + PM_CONTEXT_WHILE, +} pm_context_t; + +/* This is a node in a linked list of contexts. */ +typedef struct pm_context_node { + /* The context that this node represents. */ + pm_context_t context; + + /* A pointer to the previous context in the linked list. */ + struct pm_context_node *prev; +} pm_context_node_t; + +/* The type of shareable constant value that can be set. */ +typedef uint8_t pm_shareable_constant_value_t; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY; + +/* + * This tracks an individual local variable in a certain lexical context, as + * well as the number of times is it read. + */ +typedef struct { + /* The name of the local variable. */ + pm_constant_id_t name; + + /* The location of the local variable in the source. */ + pm_location_t location; + + /* The index of the local variable in the local table. */ + uint32_t index; + + /* The number of times the local variable is read. */ + uint32_t reads; + + /* The hash of the local variable. */ + uint32_t hash; +} pm_local_t; + +/* + * This is a set of local variables in a certain lexical context (method, class, + * module, etc.). We need to track how many times these variables are read in + * order to warn if they only get written. + */ +typedef struct pm_locals { + /* The number of local variables in the set. */ + uint32_t size; + + /* The capacity of the local variables set. */ + uint32_t capacity; + + /* + * A bloom filter over constant IDs stored in this set. Used to quickly + * reject lookups for names that are definitely not present, avoiding the + * cost of a linear scan or hash probe. + */ + uint32_t bloom; + + /* The nullable allocated memory for the local variables in the set. */ + pm_local_t *locals; +} pm_locals_t; + +/* The flags about scope parameters that can be set. */ +typedef uint8_t pm_scope_parameters_t; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40; + +/* + * This struct represents a node in a linked list of scopes. Some scopes can see + * into their parent scopes, while others cannot. + */ +typedef struct pm_scope { + /* A pointer to the previous scope in the linked list. */ + struct pm_scope *previous; + + /* The IDs of the locals in the given scope. */ + pm_locals_t locals; + + /* + * This is a list of the implicit parameters contained within the block. + * These will be processed after the block is parsed to determine the kind + * of parameters node that should be used and to check if any errors need to + * be added. + */ + pm_node_list_t implicit_parameters; + + /* + * This is a bitfield that indicates the parameters that are being used in + * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants. + * There are three different kinds of parameters that can be used in a + * scope: + * + * - Ordinary parameters (e.g., def foo(bar); end) + * - Numbered parameters (e.g., def foo; _1; end) + * - The it parameter (e.g., def foo; it; end) + * + * If ordinary parameters are being used, then certain parameters can be + * forwarded to another method/structure. Those are indicated by four + * additional bits in the params field. For example, some combinations of: + * + * - def foo(*); end + * - def foo(**); end + * - def foo(&); end + * - def foo(...); end + */ + pm_scope_parameters_t parameters; + + /* + * The current state of constant shareability for this scope. This is + * changed by magic shareable_constant_value comments. + */ + pm_shareable_constant_value_t shareable_constant; + + /* + * A boolean indicating whether or not this scope can see into its parent. + * If closed is true, then the scope cannot see into its parent. + */ + bool closed; +} pm_scope_t; + +/* + * A struct that represents a stack of boolean values. + */ +typedef uint32_t pm_state_stack_t; + +/* + * This struct represents the overall parser. It contains a reference to the + * source file, as well as pointers that indicate where in the source it's + * currently parsing. It also contains the most recent and current token that + * it's considering. + */ +struct pm_parser_t { + /* The arena used for all AST-lifetime allocations. Caller-owned. */ + pm_arena_t *arena; + + /* The arena used for parser metadata (comments, diagnostics, etc.). */ + pm_arena_t metadata_arena; + + /* + * The next node identifier that will be assigned. This is a unique + * identifier used to track nodes such that the syntax tree can be dropped + * but the node can be found through another parse. + */ + uint32_t node_id; + + /* + * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant + * constant pool lookups when the same token is resolved multiple times + * (e.g., once during lexing for local variable detection, and again + * during parsing for node creation). + */ + struct { + const uint8_t *start; + const uint8_t *end; + pm_constant_id_t id; + } constant_cache; + + /* The current state of the lexer. */ + pm_lex_state_t lex_state; + + /* Tracks the current nesting of (), [], and {}. */ + int enclosure_nesting; + + /* + * Used to temporarily track the nesting of enclosures to determine if a { + * is the beginning of a lambda following the parameters of a lambda. + */ + int lambda_enclosure_nesting; + + /* + * Used to track the nesting of braces to ensure we get the correct value + * when we are interpolating blocks with braces. + */ + int brace_nesting; + + /* + * The stack used to determine if a do keyword belongs to the predicate of a + * while, until, or for loop. + */ + pm_state_stack_t do_loop_stack; + + /* + * The stack used to determine if a do keyword belongs to the beginning of a + * block. + */ + pm_state_stack_t accepts_block_stack; + + /* A stack of lex modes. */ + struct { + /* The current mode of the lexer. */ + pm_lex_mode_t *current; + + /* The stack of lexer modes. */ + pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; + + /* The current index into the lexer mode stack. */ + size_t index; + } lex_modes; + + /* The pointer to the start of the source. */ + const uint8_t *start; + + /* The pointer to the end of the source. */ + const uint8_t *end; + + /* The previous token we were considering. */ + pm_token_t previous; + + /* The current token we're considering. */ + pm_token_t current; + + /* + * This is a special field set on the parser when we need the parser to jump + * to a specific location when lexing the next token, as opposed to just + * using the end of the previous token. Normally this is NULL. + */ + const uint8_t *next_start; + + /* + * This field indicates the end of a heredoc whose identifier was found on + * the current line. If another heredoc is found on the same line, then this + * will be moved forward to the end of that heredoc. If no heredocs are + * found on a line then this is NULL. + */ + const uint8_t *heredoc_end; + + /* The list of comments that have been found while parsing. */ + pm_list_t comment_list; + + /* The list of magic comments that have been found while parsing. */ + pm_list_t magic_comment_list; + + /* + * An optional location that represents the location of the __END__ marker + * and the rest of the content of the file. This content is loaded into the + * DATA constant when the file being parsed is the main file being executed. + */ + pm_location_t data_loc; + + /* The list of warnings that have been found while parsing. */ + pm_list_t warning_list; + + /* The list of errors that have been found while parsing. */ + pm_list_t error_list; + + /* The current local scope. */ + pm_scope_t *current_scope; + + /* The current parsing context. */ + pm_context_node_t *current_context; + + /* + * The hash keys for the hash that is currently being parsed. This is not + * usually necessary because it can pass it down the various call chains, + * but in the event that you're parsing a hash that is being directly + * pushed into another hash with **, we need to share the hash keys so that + * we can warn for the nested hash as well. + */ + pm_static_literals_t *current_hash_keys; + + /* + * The encoding functions for the current file is attached to the parser as + * it's parsing so that it can change with a magic comment. + */ + const pm_encoding_t *encoding; + + /* + * When the encoding that is being used to parse the source is changed by + * prism, we provide the ability here to call out to a user-defined + * function. + */ + pm_encoding_changed_callback_t encoding_changed_callback; + + /* + * This pointer indicates where a comment must start if it is to be + * considered an encoding comment. + */ + const uint8_t *encoding_comment_start; + + /* + * When you are lexing through a file, the lexer needs all of the information + * that the parser additionally provides (for example, the local table). So if + * you want to properly lex Ruby, you need to actually lex it in the context of + * the parser. In order to provide this functionality, we optionally allow a + * struct to be attached to the parser that calls back out to a user-provided + * callback when each token is lexed. + */ + struct { + /* + * This is the callback that is called when a token is lexed. It is + * passed the opaque data pointer, the parser, and the token that was + * lexed. + */ + pm_lex_callback_t callback; + + /* + * This opaque pointer is used to provide whatever information the user + * deemed necessary to the callback. In our case we use it to pass the + * array that the tokens get appended into. + */ + void *data; + } lex_callback; + + /* + * This is the path of the file being parsed. We use the filepath when + * constructing SourceFileNodes. + */ + pm_string_t filepath; + + /* + * This constant pool keeps all of the constants defined throughout the file + * so that we can reference them later. + */ + pm_constant_pool_t constant_pool; + + /* This is the list of line offsets in the source file. */ + pm_line_offset_list_t line_offsets; + + /* + * State communicated from the lexer to the parser for integer tokens. + */ + struct { + /* + * A flag indicating the base of the integer (binary, octal, decimal, + * hexadecimal). Set during lexing and read during node creation. + */ + pm_node_flags_t base; + + /* + * When lexing a decimal integer that fits in a uint32_t, we compute + * the value during lexing to avoid re-scanning the digits during + * parsing. If lexed is true, this holds the result and + * pm_integer_parse can be skipped. + */ + uint32_t value; + + /* Whether value holds a valid pre-computed integer. */ + bool lexed; + } integer; + + /* + * This string is used to pass information from the lexer to the parser. It + * is particularly necessary because of escape sequences. + */ + pm_string_t current_string; + + /* + * The line number at the start of the parse. This will be used to offset + * the line numbers of all of the locations. + */ + int32_t start_line; + + /* + * When a string-like expression is being lexed, any byte or escape sequence + * that resolves to a value whose top bit is set (i.e., >= 0x80) will + * explicitly set the encoding to the same encoding as the source. + * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that + * resolves to a value whose top bit is set, then the encoding will be + * explicitly set to UTF-8. + * + * The _next_ time this happens, if the encoding that is about to become the + * explicitly set encoding does not match the previously set explicit + * encoding, a mixed encoding error will be emitted. + * + * When the expression is finished being lexed, the explicit encoding + * controls the encoding of the expression. For the most part this means + * that the expression will either be encoded in the source encoding or + * UTF-8. This holds for all encodings except US-ASCII. If the source is + * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the + * expression will be encoded as ASCII-8BIT. + * + * Note that if the expression is a list, different elements within the same + * list can have different encodings, so this will get reset between each + * element. Furthermore all of this only applies to lists that support + * interpolation, because otherwise escapes that could change the encoding + * are ignored. + * + * At first glance, it may make more sense for this to live on the lexer + * mode, but we need it here to communicate back to the parser for character + * literals that do not push a new lexer mode. + */ + const pm_encoding_t *explicit_encoding; + + /* + * When parsing block exits (e.g., break, next, redo), we need to validate + * that they are in correct contexts. For the most part we can do this by + * looking at our parent contexts. However, modifier while and until + * expressions can change that context to make block exits valid. In these + * cases, we need to keep track of the block exits and then validate them + * after the expression has been parsed. + * + * We use a pointer here because we don't want to keep a whole list attached + * since this will only be used in the context of begin/end expressions. + */ + pm_node_list_t *current_block_exits; + + /* The version of prism that we should use to parse. */ + pm_options_version_t version; + + /* The command line flags given from the options. */ + uint8_t command_line; + + /* + * Whether or not we have found a frozen_string_literal magic comment with + * a true or false value. + * May be: + * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET + */ + int8_t frozen_string_literal; + + /* + * Whether or not we are parsing an eval string. This impacts whether or not + * we should evaluate if block exits/yields are valid. + */ + bool parsing_eval; + + /* + * Whether or not we are parsing a "partial" script, which is a script that + * will be evaluated in the context of another script, so we should not + * check jumps (next/break/etc.) for validity. + */ + bool partial_script; + + /* Whether or not we're at the beginning of a command. */ + bool command_start; + + /* + * Whether or not we're currently parsing the body of an endless method + * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be + * consumed by commands (it should bubble up to the outer context). + */ + bool in_endless_def_body; + + /* Whether or not we're currently recovering from a syntax error. */ + bool recovering; + + /* + * Whether or not the source being parsed could become valid if more input + * were appended. This is set to false when the parser encounters a token + * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to + * merely incomplete. + */ + bool continuable; + + /* + * This is very specialized behavior for when you want to parse in a context + * that does not respect encoding comments. Its main use case is translating + * into the whitequark/parser AST which re-encodes source files in UTF-8 + * before they are parsed and ignores encoding comments. + */ + bool encoding_locked; + + /* + * Whether or not the encoding has been changed by a magic comment. We use + * this to provide a fast path for the lexer instead of going through the + * function pointer. + */ + bool encoding_changed; + + /* + * This flag indicates that we are currently parsing a pattern matching + * expression and impacts that calculation of newlines. + */ + bool pattern_matching_newlines; + + /* This flag indicates that we are currently parsing a keyword argument. */ + bool in_keyword_arg; + + /* + * Whether or not the parser has seen a token that has semantic meaning + * (i.e., a token that is not a comment or whitespace). + */ + bool semantic_token_seen; + + /* + * By default, Ruby always warns about mismatched indentation. This can be + * toggled with a magic comment. + */ + bool warn_mismatched_indentation; + +#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR) + /* + * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding + * the nibble-based tables on every call when the charset hasn't changed + * (which is the common case during string/regex/list lexing). + */ + struct { + /* The cached charset (null-terminated, max 11 chars + NUL). */ + uint8_t charset[12]; + + /* Nibble-based low lookup table for SIMD matching. */ + uint8_t low_lut[16]; + + /* Nibble-based high lookup table for SIMD matching. */ + uint8_t high_lut[16]; + + /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */ + uint64_t table[4]; + } strpbrk_cache; +#endif +}; + +/* + * Initialize a parser with the given start and end pointers. + */ +void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options); + +/* + * Free the memory held by the given parser. + * + * This does not free the `pm_options_t` object that was used to initialize the + * parser. + */ +void pm_parser_cleanup(pm_parser_t *parser); + +#endif diff --git a/prism/internal/regexp.h b/prism/internal/regexp.h new file mode 100644 index 0000000000..3710c984fc --- /dev/null +++ b/prism/internal/regexp.h @@ -0,0 +1,41 @@ +#ifndef PRISM_INTERNAL_REGEXP_H +#define PRISM_INTERNAL_REGEXP_H + +#include "prism/ast.h" +#include "prism/parser.h" + +/* + * Accumulation state for named capture groups found during regexp parsing. + * The caller initializes this with the call node and passes it to + * pm_regexp_parse. The regexp parser populates match and names as groups + * are found. + */ +typedef struct { + /* The call node wrapping the regular expression node (for =~). */ + pm_call_node_t *call; + + /* The match write node being built, or NULL if no captures found yet. */ + pm_match_write_node_t *match; + + /* The list of capture names found so far (for deduplication). */ + pm_constant_id_list_t names; +} pm_regexp_name_data_t; + +/* + * Callback invoked by pm_regexp_parse() for each named capture group found. + */ +typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data); + +/* + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Returns the encoding flags to set on the node. + */ +PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); + +/* + * Parse an interpolated regular expression for named capture groups only. + * No encoding validation is performed. + */ +void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); + +#endif diff --git a/prism/internal/serialize.h b/prism/internal/serialize.h new file mode 100644 index 0000000000..e611a0374b --- /dev/null +++ b/prism/internal/serialize.h @@ -0,0 +1,34 @@ +#ifndef PRISM_INTERNAL_SERIALIZE_H +#define PRISM_INTERNAL_SERIALIZE_H + +#include "prism/internal/encoding.h" +#include "prism/internal/list.h" + +#include "prism/ast.h" +#include "prism/buffer.h" +#include "prism/excludes.h" +#include "prism/parser.h" + +/* We optionally support serializing to a binary string. For systems that do not + * want or need this functionality, it can be turned off with the + * PRISM_EXCLUDE_SERIALIZATION define. */ +#ifndef PRISM_EXCLUDE_SERIALIZATION + +/* + * Serialize the given list of comments to the given buffer. + */ +void pm_serialize_comment_list(pm_list_t *list, pm_buffer_t *buffer); + +/* + * Serialize the name of the encoding to the buffer. + */ +void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer); + +/* + * Serialize the encoding, metadata, nodes, and constant pool. + */ +void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); + +#endif + +#endif diff --git a/prism/internal/source.h b/prism/internal/source.h new file mode 100644 index 0000000000..b3c2b55be3 --- /dev/null +++ b/prism/internal/source.h @@ -0,0 +1,72 @@ +#ifndef PRISM_INTERNAL_SOURCE_H +#define PRISM_INTERNAL_SOURCE_H + +#include "prism/source.h" +#include "prism/buffer.h" + +#include <stdbool.h> + +/* + * The type of source, which determines cleanup behavior. + */ +typedef enum { + /* Wraps existing constant memory, no cleanup. */ + PM_SOURCE_CONSTANT, + + /* Wraps existing shared memory (non-owning slice), no cleanup. */ + PM_SOURCE_SHARED, + + /* Owns a heap-allocated buffer, freed on cleanup. */ + PM_SOURCE_OWNED, + + /* Memory-mapped file, unmapped on cleanup. */ + PM_SOURCE_MAPPED, + + /* Stream source backed by a pm_buffer_t. */ + PM_SOURCE_STREAM +} pm_source_type_t; + +/* + * The internal representation of a source. + */ +struct pm_source_t { + /* A pointer to the start of the source data. */ + const uint8_t *source; + + /* The length of the source data in bytes. */ + size_t length; + + /* The type of the source. */ + pm_source_type_t type; + + /* Stream-specific data, only used for PM_SOURCE_STREAM sources. */ + struct { + /* The buffer that holds the accumulated stream data. */ + pm_buffer_t *buffer; + + /* The stream object to read from. */ + void *stream; + + /* The function to use to read from the stream. */ + pm_source_stream_fgets_t *fgets; + + /* The function to use to check if the stream is at EOF. */ + pm_source_stream_feof_t *feof; + + /* Whether the stream has reached EOF. */ + bool eof; + } stream; +}; + +/* + * Read from a stream into the source's internal buffer. This is used by + * pm_parse_stream to incrementally read the source. + */ +bool pm_source_stream_read(pm_source_t *source); + +/* + * Returns whether the stream source has reached EOF. + */ +bool pm_source_stream_eof(const pm_source_t *source); + +#endif diff --git a/prism/internal/static_literals.h b/prism/internal/static_literals.h new file mode 100644 index 0000000000..d59002ac0a --- /dev/null +++ b/prism/internal/static_literals.h @@ -0,0 +1,98 @@ +#ifndef PRISM_INTERNAL_STATIC_LITERALS_H +#define PRISM_INTERNAL_STATIC_LITERALS_H + +#include "prism/ast.h" +#include "prism/buffer.h" +#include "prism/line_offset_list.h" + +/* + * An internal hash table for a set of nodes. + */ +typedef struct { + /* The array of nodes in the hash table. */ + pm_node_t **nodes; + + /* The size of the hash table. */ + uint32_t size; + + /* The space that has been allocated in the hash table. */ + uint32_t capacity; +} pm_node_hash_t; + +/* + * Certain sets of nodes (hash keys and when clauses) check for duplicate nodes + * to alert the user of potential issues. To do this, we keep a set of the nodes + * that have been seen so far, and compare whenever we find a new node. + * + * We bucket the nodes based on their type to minimize the number of comparisons + * that need to be performed. + */ +typedef struct { + /* + * This is the set of IntegerNode and SourceLineNode instances. + */ + pm_node_hash_t integer_nodes; + + /* + * This is the set of FloatNode instances. + */ + pm_node_hash_t float_nodes; + + /* + * This is the set of RationalNode and ImaginaryNode instances. + */ + pm_node_hash_t number_nodes; + + /* + * This is the set of StringNode and SourceFileNode instances. + */ + pm_node_hash_t string_nodes; + + /* + * This is the set of RegularExpressionNode instances. + */ + pm_node_hash_t regexp_nodes; + + /* + * This is the set of SymbolNode instances. + */ + pm_node_hash_t symbol_nodes; + + /* + * A pointer to the last TrueNode instance that was inserted, or NULL. + */ + pm_node_t *true_node; + + /* + * A pointer to the last FalseNode instance that was inserted, or NULL. + */ + pm_node_t *false_node; + + /* + * A pointer to the last NilNode instance that was inserted, or NULL. + */ + pm_node_t *nil_node; + + /* + * A pointer to the last SourceEncodingNode instance that was inserted, or + * NULL. + */ + pm_node_t *source_encoding_node; +} pm_static_literals_t; + +/* + * Add a node to the set of static literals. + */ +pm_node_t * pm_static_literals_add(const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace); + +/* + * Free the internal memory associated with the given static literals set. + */ +void pm_static_literals_free(pm_static_literals_t *literals); + +/* + * Create a string-based representation of the given static literal. + */ +void pm_static_literal_inspect(pm_buffer_t *buffer, const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, const char *encoding_name, const pm_node_t *node); + +#endif diff --git a/prism/internal/stringy.h b/prism/internal/stringy.h new file mode 100644 index 0000000000..1aaa23ea75 --- /dev/null +++ b/prism/internal/stringy.h @@ -0,0 +1,30 @@ +#ifndef PRISM_INTERNAL_STRINGY_H +#define PRISM_INTERNAL_STRINGY_H + +#include "prism/stringy.h" + +/* + * Defines an empty string. This is useful for initializing a string that will + * be filled in later. + */ +#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 }) + +/* + * Initialize a shared string that is based on initial input. + */ +void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end); + +/* + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + */ +int pm_string_compare(const pm_string_t *left, const pm_string_t *right); + +/* + * Free the associated memory of the given string. + */ +void pm_string_cleanup(pm_string_t *string); + +#endif diff --git a/prism/internal/strncasecmp.h b/prism/internal/strncasecmp.h new file mode 100644 index 0000000000..775f6a993e --- /dev/null +++ b/prism/internal/strncasecmp.h @@ -0,0 +1,18 @@ +#ifndef PRISM_INTERNAL_STRNCASECMP_H +#define PRISM_INTERNAL_STRNCASECMP_H + +#include <stddef.h> +#include <stdint.h> + +/* + * Compare two strings, ignoring case, up to the given length. Returns 0 if the + * strings are equal, a negative number if string1 is less than string2, or a + * positive number if string1 is greater than string2. + * + * Note that this is effectively our own implementation of strncasecmp, but it's + * not available on all of the platforms we want to support so we're rolling it + * here. + */ +int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length); + +#endif diff --git a/prism/internal/strpbrk.h b/prism/internal/strpbrk.h new file mode 100644 index 0000000000..d64156c002 --- /dev/null +++ b/prism/internal/strpbrk.h @@ -0,0 +1,33 @@ +#ifndef PRISM_INTERNAL_STRPBRK_H +#define PRISM_INTERNAL_STRPBRK_H + +#include "prism/parser.h" + +/* The maximum number of bytes in a strpbrk charset. */ +#define PM_STRPBRK_CACHE_SIZE 16 + +#include <stddef.h> +#include <stdint.h> + +/* + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. + * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift-JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. + */ +const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate); + +#endif diff --git a/prism/internal/tokens.h b/prism/internal/tokens.h new file mode 100644 index 0000000000..3a983e54ae --- /dev/null +++ b/prism/internal/tokens.h @@ -0,0 +1,11 @@ +#ifndef PRISM_INTERNAL_TOKENS_H +#define PRISM_INTERNAL_TOKENS_H + +#include "prism/ast.h" + +/* + * Returns the human name of the given token type. + */ +const char * pm_token_str(pm_token_type_t token_type); + +#endif |
