diff options
Diffstat (limited to 'prism')
120 files changed, 16143 insertions, 13290 deletions
diff --git a/prism/api_pack.c b/prism/api_pack.c deleted file mode 100644 index 98509ae65c..0000000000 --- a/prism/api_pack.c +++ /dev/null @@ -1,276 +0,0 @@ -#include "prism/extension.h" - -#ifdef PRISM_EXCLUDE_PACK - -void -Init_prism_pack(void) {} - -#else - -static VALUE rb_cPrism; -static VALUE rb_cPrismPack; -static VALUE rb_cPrismPackDirective; -static VALUE rb_cPrismPackFormat; - -static VALUE v3_2_0_symbol; -static VALUE pack_symbol; -static VALUE unpack_symbol; - -#if SIZEOF_UINT64_T == SIZEOF_LONG_LONG -# define UINT64T2NUM(x) ULL2NUM(x) -# define NUM2UINT64T(x) (uint64_t)NUM2ULL(x) -#elif SIZEOF_UINT64_T == SIZEOF_LONG -# define UINT64T2NUM(x) ULONG2NUM(x) -# define NUM2UINT64T(x) (uint64_t)NUM2ULONG(x) -#else -// error No uint64_t conversion -#endif - -static VALUE -pack_type_to_symbol(pm_pack_type type) { - switch (type) { - case PM_PACK_SPACE: - return ID2SYM(rb_intern("SPACE")); - case PM_PACK_COMMENT: - return ID2SYM(rb_intern("COMMENT")); - case PM_PACK_INTEGER: - return ID2SYM(rb_intern("INTEGER")); - case PM_PACK_UTF8: - return ID2SYM(rb_intern("UTF8")); - case PM_PACK_BER: - return ID2SYM(rb_intern("BER")); - case PM_PACK_FLOAT: - return ID2SYM(rb_intern("FLOAT")); - case PM_PACK_STRING_SPACE_PADDED: - return ID2SYM(rb_intern("STRING_SPACE_PADDED")); - case PM_PACK_STRING_NULL_PADDED: - return ID2SYM(rb_intern("STRING_NULL_PADDED")); - case PM_PACK_STRING_NULL_TERMINATED: - return ID2SYM(rb_intern("STRING_NULL_TERMINATED")); - case PM_PACK_STRING_MSB: - return ID2SYM(rb_intern("STRING_MSB")); - case PM_PACK_STRING_LSB: - return ID2SYM(rb_intern("STRING_LSB")); - case PM_PACK_STRING_HEX_HIGH: - return ID2SYM(rb_intern("STRING_HEX_HIGH")); - case PM_PACK_STRING_HEX_LOW: - return ID2SYM(rb_intern("STRING_HEX_LOW")); - case PM_PACK_STRING_UU: - return ID2SYM(rb_intern("STRING_UU")); - case PM_PACK_STRING_MIME: - return ID2SYM(rb_intern("STRING_MIME")); - case PM_PACK_STRING_BASE64: - return ID2SYM(rb_intern("STRING_BASE64")); - case PM_PACK_STRING_FIXED: - return ID2SYM(rb_intern("STRING_FIXED")); - case PM_PACK_STRING_POINTER: - return ID2SYM(rb_intern("STRING_POINTER")); - case PM_PACK_MOVE: - return ID2SYM(rb_intern("MOVE")); - case PM_PACK_BACK: - return ID2SYM(rb_intern("BACK")); - case PM_PACK_NULL: - return ID2SYM(rb_intern("NULL")); - default: - return Qnil; - } -} - -static VALUE -pack_signed_to_symbol(pm_pack_signed signed_type) { - switch (signed_type) { - case PM_PACK_UNSIGNED: - return ID2SYM(rb_intern("UNSIGNED")); - case PM_PACK_SIGNED: - return ID2SYM(rb_intern("SIGNED")); - case PM_PACK_SIGNED_NA: - return ID2SYM(rb_intern("SIGNED_NA")); - default: - return Qnil; - } -} - -static VALUE -pack_endian_to_symbol(pm_pack_endian endian) { - switch (endian) { - case PM_PACK_AGNOSTIC_ENDIAN: - return ID2SYM(rb_intern("AGNOSTIC_ENDIAN")); - case PM_PACK_LITTLE_ENDIAN: - return ID2SYM(rb_intern("LITTLE_ENDIAN")); - case PM_PACK_BIG_ENDIAN: - return ID2SYM(rb_intern("BIG_ENDIAN")); - case PM_PACK_NATIVE_ENDIAN: - return ID2SYM(rb_intern("NATIVE_ENDIAN")); - case PM_PACK_ENDIAN_NA: - return ID2SYM(rb_intern("ENDIAN_NA")); - default: - return Qnil; - } -} - -static VALUE -pack_size_to_symbol(pm_pack_size size) { - switch (size) { - case PM_PACK_SIZE_SHORT: - return ID2SYM(rb_intern("SIZE_SHORT")); - case PM_PACK_SIZE_INT: - return ID2SYM(rb_intern("SIZE_INT")); - case PM_PACK_SIZE_LONG: - return ID2SYM(rb_intern("SIZE_LONG")); - case PM_PACK_SIZE_LONG_LONG: - return ID2SYM(rb_intern("SIZE_LONG_LONG")); - case PM_PACK_SIZE_8: - return ID2SYM(rb_intern("SIZE_8")); - case PM_PACK_SIZE_16: - return ID2SYM(rb_intern("SIZE_16")); - case PM_PACK_SIZE_32: - return ID2SYM(rb_intern("SIZE_32")); - case PM_PACK_SIZE_64: - return ID2SYM(rb_intern("SIZE_64")); - case PM_PACK_SIZE_P: - return ID2SYM(rb_intern("SIZE_P")); - case PM_PACK_SIZE_NA: - return ID2SYM(rb_intern("SIZE_NA")); - default: - return Qnil; - } -} - -static VALUE -pack_length_type_to_symbol(pm_pack_length_type length_type) { - switch (length_type) { - case PM_PACK_LENGTH_FIXED: - return ID2SYM(rb_intern("LENGTH_FIXED")); - case PM_PACK_LENGTH_MAX: - return ID2SYM(rb_intern("LENGTH_MAX")); - case PM_PACK_LENGTH_RELATIVE: - return ID2SYM(rb_intern("LENGTH_RELATIVE")); - case PM_PACK_LENGTH_NA: - return ID2SYM(rb_intern("LENGTH_NA")); - default: - return Qnil; - } -} - -static VALUE -pack_encoding_to_ruby(pm_pack_encoding encoding) { - int index; - switch (encoding) { - case PM_PACK_ENCODING_ASCII_8BIT: - index = rb_ascii8bit_encindex(); - break; - case PM_PACK_ENCODING_US_ASCII: - index = rb_usascii_encindex(); - break; - case PM_PACK_ENCODING_UTF_8: - index = rb_utf8_encindex(); - break; - default: - return Qnil; - } - return rb_enc_from_encoding(rb_enc_from_index(index)); -} - -/** - * call-seq: - * Pack::parse(version, variant, source) -> Format - * - * Parse the given source and return a format object. - */ -static VALUE -pack_parse(VALUE self, VALUE version_symbol, VALUE variant_symbol, VALUE format_string) { - if (version_symbol != v3_2_0_symbol) { - rb_raise(rb_eArgError, "invalid version"); - } - - pm_pack_variant variant; - if (variant_symbol == pack_symbol) { - variant = PM_PACK_VARIANT_PACK; - } else if (variant_symbol == unpack_symbol) { - variant = PM_PACK_VARIANT_UNPACK; - } else { - rb_raise(rb_eArgError, "invalid variant"); - } - - StringValue(format_string); - - const char *format = RSTRING_PTR(format_string); - const char *format_end = format + RSTRING_LEN(format_string); - pm_pack_encoding encoding = PM_PACK_ENCODING_START; - - VALUE directives_array = rb_ary_new(); - - while (format < format_end) { - pm_pack_type type; - pm_pack_signed signed_type; - pm_pack_endian endian; - pm_pack_size size; - pm_pack_length_type length_type; - uint64_t length; - - const char *directive_start = format; - - pm_pack_result parse_result = pm_pack_parse(variant, &format, format_end, &type, &signed_type, &endian, - &size, &length_type, &length, &encoding); - - const char *directive_end = format; - - switch (parse_result) { - case PM_PACK_OK: - break; - case PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE: - rb_raise(rb_eArgError, "unsupported directive"); - case PM_PACK_ERROR_UNKNOWN_DIRECTIVE: - rb_raise(rb_eArgError, "unsupported directive"); - case PM_PACK_ERROR_LENGTH_TOO_BIG: - rb_raise(rb_eRangeError, "pack length too big"); - case PM_PACK_ERROR_BANG_NOT_ALLOWED: - rb_raise(rb_eRangeError, "bang not allowed"); - case PM_PACK_ERROR_DOUBLE_ENDIAN: - rb_raise(rb_eRangeError, "double endian"); - default: - rb_bug("parse result"); - } - - if (type == PM_PACK_END) { - break; - } - - VALUE directive_args[9] = { - version_symbol, - variant_symbol, - rb_usascii_str_new(directive_start, directive_end - directive_start), - pack_type_to_symbol(type), - pack_signed_to_symbol(signed_type), - pack_endian_to_symbol(endian), - pack_size_to_symbol(size), - pack_length_type_to_symbol(length_type), - UINT64T2NUM(length) - }; - - rb_ary_push(directives_array, rb_class_new_instance(9, directive_args, rb_cPrismPackDirective)); - } - - VALUE format_args[2]; - format_args[0] = directives_array; - format_args[1] = pack_encoding_to_ruby(encoding); - return rb_class_new_instance(2, format_args, rb_cPrismPackFormat); -} - -/** - * The function that gets called when Ruby initializes the prism extension. - */ -void -Init_prism_pack(void) { - rb_cPrism = rb_define_module("Prism"); - rb_cPrismPack = rb_define_module_under(rb_cPrism, "Pack"); - rb_cPrismPackDirective = rb_define_class_under(rb_cPrismPack, "Directive", rb_cObject); - rb_cPrismPackFormat = rb_define_class_under(rb_cPrismPack, "Format", rb_cObject); - rb_define_singleton_method(rb_cPrismPack, "parse", pack_parse, 3); - - v3_2_0_symbol = ID2SYM(rb_intern("v3_2_0")); - pack_symbol = ID2SYM(rb_intern("pack")); - unpack_symbol = ID2SYM(rb_intern("unpack")); -} - -#endif diff --git a/prism/arena.c b/prism/arena.c new file mode 100644 index 0000000000..64a731649d --- /dev/null +++ b/prism/arena.c @@ -0,0 +1,117 @@ +#include "prism/internal/arena.h" + +#include "prism/internal/allocator.h" + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> + +/** + * Compute the block allocation size using offsetof so it is correct regardless + * of PM_FLEX_ARRAY_LENGTH. + */ +#define PM_ARENA_BLOCK_SIZE(data_size) (offsetof(pm_arena_block_t, data) + (data_size)) + +/** Initial block data size: 8 KB. */ +#define PM_ARENA_INITIAL_SIZE 8192 + +/** Double the block size every this many blocks. */ +#define PM_ARENA_GROWTH_INTERVAL 8 + +/** Maximum block data size: 1 MB. */ +#define PM_ARENA_MAX_SIZE (1024 * 1024) + +/** + * Compute the data size for the next block. + */ +static size_t +pm_arena_next_block_size(const pm_arena_t *arena, size_t min_size) { + size_t size = PM_ARENA_INITIAL_SIZE; + + for (size_t exp = PM_ARENA_GROWTH_INTERVAL; exp <= arena->block_count; exp += PM_ARENA_GROWTH_INTERVAL) { + if (size < PM_ARENA_MAX_SIZE) size *= 2; + } + + return size > min_size ? size : min_size; +} + +/** + * Allocate a new block with the given data capacity and initial usage, link it + * into the arena, and return it. Aborts on allocation failure. + */ +static pm_arena_block_t * +pm_arena_block_new(pm_arena_t *arena, size_t data_size, size_t initial_used) { + assert(initial_used <= data_size); + pm_arena_block_t *block = (pm_arena_block_t *) xmalloc(PM_ARENA_BLOCK_SIZE(data_size)); + + if (block == NULL) { + fprintf(stderr, "prism: out of memory; aborting\n"); + abort(); + } + + block->capacity = data_size; + block->used = initial_used; + block->prev = arena->current; + arena->current = block; + arena->block_count++; + + return block; +} + +/** + * Ensure the arena has at least `capacity` bytes available in its current + * block, allocating a new block if necessary. This allows callers to + * pre-size the arena to avoid repeated small block allocations. + */ +void +pm_arena_reserve(pm_arena_t *arena, size_t capacity) { + if (capacity <= PM_ARENA_INITIAL_SIZE) return; + if (arena->current != NULL && (arena->current->capacity - arena->current->used) >= capacity) return; + pm_arena_block_new(arena, capacity, 0); +} + +/** + * Slow path for pm_arena_alloc: allocate a new block and return a pointer to + * the first `size` bytes. Called when the current block has insufficient space. + */ +void * +pm_arena_alloc_slow(pm_arena_t *arena, size_t size) { + size_t block_data_size = pm_arena_next_block_size(arena, size); + pm_arena_block_t *block = pm_arena_block_new(arena, block_data_size, size); + return block->data; +} + +/** + * Returns a newly allocated and initialized arena. + */ +pm_arena_t * +pm_arena_new(void) { + pm_arena_t *arena = (pm_arena_t *) xcalloc(1, sizeof(pm_arena_t)); + if (arena == NULL) abort(); + return arena; +} + +/** + * Free all blocks in the arena. + */ +void +pm_arena_cleanup(pm_arena_t *arena) { + pm_arena_block_t *block = arena->current; + + while (block != NULL) { + pm_arena_block_t *prev = block->prev; + xfree_sized(block, PM_ARENA_BLOCK_SIZE(block->capacity)); + block = prev; + } + + *arena = (pm_arena_t) { 0 }; +} + +/** + * Frees both the held memory and the arena itself. + */ +void +pm_arena_free(pm_arena_t *arena) { + pm_arena_cleanup(arena); + xfree_sized(arena, sizeof(pm_arena_t)); +} diff --git a/prism/arena.h b/prism/arena.h new file mode 100644 index 0000000000..e1fa8fc6ad --- /dev/null +++ b/prism/arena.h @@ -0,0 +1,37 @@ +/** + * @file arena.h + * + * A bump allocator for the prism parser. + */ +#ifndef PRISM_ARENA_H +#define PRISM_ARENA_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> + +/** + * An opaque pointer to an arena that is used for allocations. + */ +typedef struct pm_arena_t pm_arena_t; + +/** + * Returns a newly allocated and initialized arena. If the arena cannot be + * allocated, this function aborts the process. + * + * @returns A pointer to the newly allocated arena. It is the responsibility of + * the caller to free the arena using pm_arena_free when it is no longer + * needed. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_arena_t * pm_arena_new(void); + +/** + * Frees both the held memory and the arena itself. + * + * @param arena The arena to free. + */ +PRISM_EXPORTED_FUNCTION void pm_arena_free(pm_arena_t *arena) PRISM_NONNULL(1); + +#endif diff --git a/prism/util/pm_buffer.c b/prism/buffer.c index 2136a7c43e..cb3b9a4fe8 100644 --- a/prism/util/pm_buffer.c +++ b/prism/buffer.c @@ -1,31 +1,38 @@ -#include "prism/util/pm_buffer.h" +#include "prism/internal/buffer.h" -/** - * Return the size of the pm_buffer_t struct. - */ -size_t -pm_buffer_sizeof(void) { - return sizeof(pm_buffer_t); -} +#include "prism/compiler/inline.h" + +#include "prism/internal/char.h" +#include "prism/internal/allocator.h" + +#include <assert.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> /** * Initialize a pm_buffer_t with the given capacity. */ -bool -pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity) { +void +pm_buffer_init(pm_buffer_t *buffer, size_t capacity) { buffer->length = 0; buffer->capacity = capacity; buffer->value = (char *) xmalloc(capacity); - return buffer->value != NULL; + if (buffer->value == NULL) abort(); } /** - * Initialize a pm_buffer_t with its default values. + * Allocate and initialize a new buffer. */ -bool -pm_buffer_init(pm_buffer_t *buffer) { - return pm_buffer_init_capacity(buffer, 1024); +pm_buffer_t * +pm_buffer_new(void) { + pm_buffer_t *buffer = (pm_buffer_t *) xmalloc(sizeof(pm_buffer_t)); + if (buffer == NULL) abort(); + + pm_buffer_init(buffer, 1024); + return buffer; } /** @@ -47,9 +54,10 @@ pm_buffer_length(const pm_buffer_t *buffer) { /** * Append the given amount of space to the buffer. */ -static inline bool +static PRISM_INLINE bool pm_buffer_append_length(pm_buffer_t *buffer, size_t length) { size_t next_length = buffer->length + length; + const size_t original_capacity = buffer->capacity; if (next_length > buffer->capacity) { if (buffer->capacity == 0) { @@ -60,7 +68,7 @@ pm_buffer_append_length(pm_buffer_t *buffer, size_t length) { buffer->capacity *= 2; } - buffer->value = xrealloc(buffer->value, buffer->capacity); + buffer->value = xrealloc_sized(buffer->value, buffer->capacity, original_capacity); if (buffer->value == NULL) return false; } @@ -71,7 +79,7 @@ pm_buffer_append_length(pm_buffer_t *buffer, size_t length) { /** * Append a generic pointer to memory to the buffer. */ -static inline void +static PRISM_INLINE void pm_buffer_append(pm_buffer_t *buffer, const void *source, size_t length) { size_t cursor = buffer->length; if (pm_buffer_append_length(buffer, length)) { @@ -349,9 +357,18 @@ pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t le } /** - * Free the memory associated with the buffer. + * Free the memory held by the buffer. + */ +void +pm_buffer_cleanup(pm_buffer_t *buffer) { + xfree_sized(buffer->value, buffer->capacity); +} + +/** + * Free both the memory held by the buffer and the buffer itself. */ void pm_buffer_free(pm_buffer_t *buffer) { - xfree(buffer->value); + pm_buffer_cleanup(buffer); + xfree_sized(buffer, sizeof(pm_buffer_t)); } diff --git a/prism/buffer.h b/prism/buffer.h new file mode 100644 index 0000000000..24b572d2c3 --- /dev/null +++ b/prism/buffer.h @@ -0,0 +1,52 @@ +/** + * @file buffer.h + * + * A wrapper around a contiguous block of allocated memory. + */ +#ifndef PRISM_BUFFER_H +#define PRISM_BUFFER_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> + +/** + * A wrapper around a contiguous block of allocated memory. + */ +typedef struct pm_buffer_t pm_buffer_t; + +/** + * Allocate and initialize a new buffer. If the buffer cannot be allocated, this + * function will abort the process. + * + * @returns A pointer to the initialized buffer. The caller is responsible for + * freeing the buffer with pm_buffer_free. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_buffer_t * pm_buffer_new(void); + +/** + * Free both the memory held by the buffer and the buffer itself. + * + * @param buffer The buffer to free. + */ +PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer) PRISM_NONNULL(1); + +/** + * Return the value of the buffer. + * + * @param buffer The buffer to get the value of. + * @returns The value of the buffer. + */ +PRISM_EXPORTED_FUNCTION char * pm_buffer_value(const pm_buffer_t *buffer) PRISM_NONNULL(1); + +/** + * Return the length of the buffer. + * + * @param buffer The buffer to get the length of. + * @returns The length of the buffer. + */ +PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(const pm_buffer_t *buffer) PRISM_NONNULL(1); + +#endif diff --git a/prism/util/pm_char.c b/prism/char.c index a51dc11645..08e457aa1f 100644 --- a/prism/util/pm_char.c +++ b/prism/char.c @@ -1,7 +1,8 @@ -#include "prism/util/pm_char.h" +#include "prism/internal/char.h" + +#include "prism/compiler/inline.h" +#include "prism/internal/line_offset_list.h" -#define PRISM_CHAR_BIT_WHITESPACE (1 << 0) -#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1) #define PRISM_CHAR_BIT_REGEXP_OPTION (1 << 2) #define PRISM_NUMBER_BIT_BINARY_DIGIT (1 << 0) @@ -13,7 +14,7 @@ #define PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT (1 << 6) #define PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER (1 << 7) -static const uint8_t pm_byte_table[256] = { +const uint8_t pm_byte_table[256] = { // 0 1 2 3 4 5 6 7 8 9 A B C D E F 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3, 0, 0, // 0x 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x @@ -57,7 +58,7 @@ static const uint8_t pm_number_table[256] = { * Returns the number of characters at the start of the string that match the * given kind. Disallows searching past the given maximum number of characters. */ -static inline size_t +static PRISM_INLINE size_t pm_strspn_char_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { if (length <= 0) return 0; @@ -83,15 +84,15 @@ pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length) { * searching past the given maximum number of characters. */ size_t -pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list) { +pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_arena_t *arena, pm_line_offset_list_t *line_offsets, uint32_t start_offset) { if (length <= 0) return 0; - size_t size = 0; - size_t maximum = (size_t) length; + uint32_t size = 0; + uint32_t maximum = (uint32_t) length; while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_WHITESPACE)) { if (string[size] == '\n') { - pm_newline_list_append(newline_list, string + size); + pm_line_offset_list_append(arena, line_offsets, start_offset + size + 1); } size++; @@ -101,15 +102,6 @@ pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newlin } /** - * Returns the number of characters at the start of the string that are inline - * whitespace. Disallows searching past the given maximum number of characters. - */ -size_t -pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) { - return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_INLINE_WHITESPACE); -} - -/** * Returns the number of characters at the start of the string that are regexp * options. Disallows searching past the given maximum number of characters. */ @@ -118,36 +110,13 @@ pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length) { return pm_strspn_char_kind(string, length, PRISM_CHAR_BIT_REGEXP_OPTION); } -/** - * Returns true if the given character matches the given kind. - */ -static inline bool -pm_char_is_char_kind(const uint8_t b, uint8_t kind) { - return (pm_byte_table[b] & kind) != 0; -} - -/** - * Returns true if the given character is a whitespace character. - */ -bool -pm_char_is_whitespace(const uint8_t b) { - return pm_char_is_char_kind(b, PRISM_CHAR_BIT_WHITESPACE); -} - -/** - * Returns true if the given character is an inline whitespace character. - */ -bool -pm_char_is_inline_whitespace(const uint8_t b) { - return pm_char_is_char_kind(b, PRISM_CHAR_BIT_INLINE_WHITESPACE); -} /** * Scan through the string and return the number of characters at the start of * the string that match the given kind. Disallows searching past the given * maximum number of characters. */ -static inline size_t +static PRISM_INLINE size_t pm_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { if (length <= 0) return 0; @@ -166,7 +135,7 @@ pm_strspn_number_kind(const uint8_t *string, ptrdiff_t length, uint8_t kind) { * Additionally, report the location of the last invalid underscore character * found in the string through the out invalid parameter. */ -static inline size_t +static PRISM_INLINE size_t pm_strspn_number_kind_underscores(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid, uint8_t kind) { if (length <= 0) return 0; @@ -267,7 +236,7 @@ pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint /** * Returns true if the given character matches the given kind. */ -static inline bool +static PRISM_INLINE bool pm_char_is_number_kind(const uint8_t b, uint8_t kind) { return (pm_number_table[b] & kind) != 0; } @@ -303,16 +272,3 @@ bool pm_char_is_hexadecimal_digit(const uint8_t b) { return pm_char_is_number_kind(b, PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT); } - -#undef PRISM_CHAR_BIT_WHITESPACE -#undef PRISM_CHAR_BIT_INLINE_WHITESPACE -#undef PRISM_CHAR_BIT_REGEXP_OPTION - -#undef PRISM_NUMBER_BIT_BINARY_DIGIT -#undef PRISM_NUMBER_BIT_BINARY_NUMBER -#undef PRISM_NUMBER_BIT_OCTAL_DIGIT -#undef PRISM_NUMBER_BIT_OCTAL_NUMBER -#undef PRISM_NUMBER_BIT_DECIMAL_DIGIT -#undef PRISM_NUMBER_BIT_DECIMAL_NUMBER -#undef PRISM_NUMBER_BIT_HEXADECIMAL_NUMBER -#undef PRISM_NUMBER_BIT_HEXADECIMAL_DIGIT diff --git a/prism/comments.h b/prism/comments.h new file mode 100644 index 0000000000..2270d53889 --- /dev/null +++ b/prism/comments.h @@ -0,0 +1,43 @@ +/** + * @file comments.h + * + * Types and functions related to comments found during parsing. + */ +#ifndef PRISM_COMMENTS_H +#define PRISM_COMMENTS_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include "prism/ast.h" + +#include <stddef.h> + +/** This is the type of a comment that we've found while parsing. */ +typedef enum { + PM_COMMENT_INLINE, + PM_COMMENT_EMBDOC +} pm_comment_type_t; + +/** An opaque pointer to a comment found while parsing. */ +typedef struct pm_comment_t pm_comment_t; + +/** + * Returns the location associated with the given comment. + * + * @param comment the comment whose location we want to get + * @returns the location associated with the given comment + */ +PRISM_EXPORTED_FUNCTION pm_location_t pm_comment_location(const pm_comment_t *comment) PRISM_NONNULL(1); + +/** + * Returns the type associated with the given comment. + * + * @param comment the comment whose type we want to get + * @returns the type associated with the given comment. This can either be + * PM_COMMENT_INLINE or PM_COMMENT_EMBDOC. + */ +PRISM_EXPORTED_FUNCTION pm_comment_type_t pm_comment_type(const pm_comment_t *comment) PRISM_NONNULL(1); + +#endif diff --git a/prism/compiler/accel.h b/prism/compiler/accel.h new file mode 100644 index 0000000000..be23236d1d --- /dev/null +++ b/prism/compiler/accel.h @@ -0,0 +1,19 @@ +/** + * @file compiler/accel.h + */ +#ifndef PRISM_COMPILER_ACCEL_H +#define PRISM_COMPILER_ACCEL_H + +/** + * Platform detection for SIMD/fast-path implementations. At most one of these + * macros is defined, selecting the best available vectorization strategy. + */ +#if (defined(__aarch64__) && defined(__ARM_NEON)) || (defined(_MSC_VER) && defined(_M_ARM64)) +# define PRISM_HAS_NEON +#elif (defined(__x86_64__) && defined(__SSSE3__)) || (defined(_MSC_VER) && defined(_M_X64)) +# define PRISM_HAS_SSSE3 +#elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +# define PRISM_HAS_SWAR +#endif + +#endif diff --git a/prism/compiler/align.h b/prism/compiler/align.h new file mode 100644 index 0000000000..22cb49a48c --- /dev/null +++ b/prism/compiler/align.h @@ -0,0 +1,36 @@ +/** + * @file compiler/align.h + */ +#ifndef PRISM_COMPILER_ALIGN_H +#define PRISM_COMPILER_ALIGN_H + +/** + * Compiler-agnostic macros for specifying alignment of types and variables. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L /* C11 or later */ + /** Specify alignment for a type or variable. */ + #define PRISM_ALIGNAS _Alignas + + /** Get the alignment requirement of a type. */ + #define PRISM_ALIGNOF _Alignof +#elif defined(__GNUC__) || defined(__clang__) + /** Specify alignment for a type or variable. */ + #define PRISM_ALIGNAS(size) __attribute__((aligned(size))) + + /** Get the alignment requirement of a type. */ + #define PRISM_ALIGNOF(type) __alignof__(type) +#elif defined(_MSC_VER) + /** Specify alignment for a type or variable. */ + #define PRISM_ALIGNAS(size) __declspec(align(size)) + + /** Get the alignment requirement of a type. */ + #define PRISM_ALIGNOF(type) __alignof(type) +#else + /** Void because this platform does not support specifying alignment. */ + #define PRISM_ALIGNAS(size) + + /** Fallback to sizeof as alignment requirement of a type. */ + #define PRISM_ALIGNOF(type) sizeof(type) +#endif + +#endif diff --git a/prism/compiler/exported.h b/prism/compiler/exported.h new file mode 100644 index 0000000000..823773ecbb --- /dev/null +++ b/prism/compiler/exported.h @@ -0,0 +1,24 @@ +/** + * @file compiler/exported.h + */ +#ifndef PRISM_COMPILER_EXPORTED_H +#define PRISM_COMPILER_EXPORTED_H + +/** + * By default, we compile with -fvisibility=hidden. When this is enabled, we + * need to mark certain functions as being publically-visible. This macro does + * that in a compiler-agnostic way. + */ +#ifndef PRISM_EXPORTED_FUNCTION +# ifdef PRISM_EXPORT_SYMBOLS +# ifdef _WIN32 +# define PRISM_EXPORTED_FUNCTION __declspec(dllexport) extern +# else +# define PRISM_EXPORTED_FUNCTION __attribute__((__visibility__("default"))) extern +# endif +# else +# define PRISM_EXPORTED_FUNCTION +# endif +#endif + +#endif diff --git a/prism/compiler/fallthrough.h b/prism/compiler/fallthrough.h new file mode 100644 index 0000000000..ce1b450e8a --- /dev/null +++ b/prism/compiler/fallthrough.h @@ -0,0 +1,22 @@ +/** + * @file compiler/fallthrough.h + */ +#ifndef PRISM_COMPILER_FALLTHROUGH_H +#define PRISM_COMPILER_FALLTHROUGH_H + +/** + * We use -Wimplicit-fallthrough to guard potentially unintended fall-through + * between cases of a switch. Use PRISM_FALLTHROUGH to explicitly annotate cases + * where the fallthrough is intentional. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L /* C23 or later */ + #define PRISM_FALLTHROUGH [[fallthrough]]; +#elif defined(__GNUC__) || defined(__clang__) + #define PRISM_FALLTHROUGH __attribute__((fallthrough)); +#elif defined(_MSC_VER) + #define PRISM_FALLTHROUGH __fallthrough; +#else + #define PRISM_FALLTHROUGH +#endif + +#endif diff --git a/prism/compiler/filesystem.h b/prism/compiler/filesystem.h new file mode 100644 index 0000000000..f988909db8 --- /dev/null +++ b/prism/compiler/filesystem.h @@ -0,0 +1,32 @@ +/** + * @file compiler/filesystem.h + * + * Platform detection for mmap and filesystem support. + */ +#ifndef PRISM_COMPILER_FILESYSTEM_H +#define PRISM_COMPILER_FILESYSTEM_H + +/** + * In general, libc for embedded systems does not support memory-mapped files. + * If the target platform is POSIX or Windows, we can map a file in memory and + * read it in a more efficient manner. + */ +#ifdef _WIN32 +# define PRISM_HAS_MMAP +#else +# include <unistd.h> +# ifdef _POSIX_MAPPED_FILES +# define PRISM_HAS_MMAP +# endif +#endif + +/** + * If PRISM_HAS_NO_FILESYSTEM is defined, then we want to exclude all filesystem + * related code from the library. All filesystem related code should be guarded + * by PRISM_HAS_FILESYSTEM. + */ +#ifndef PRISM_HAS_NO_FILESYSTEM +# define PRISM_HAS_FILESYSTEM +#endif + +#endif diff --git a/prism/compiler/flex_array.h b/prism/compiler/flex_array.h new file mode 100644 index 0000000000..7504b5fdd3 --- /dev/null +++ b/prism/compiler/flex_array.h @@ -0,0 +1,19 @@ +/** + * @file compiler/flex_array.h + */ +#ifndef PRISM_COMPILER_FLEX_ARRAY_H +#define PRISM_COMPILER_FLEX_ARRAY_H + +/** + * A macro for helper define a flexible array member. C99 supports `data[]`, GCC + * supports `data[0]` as an extension, and older compilers require `data[1]`. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) + #define PM_FLEX_ARRAY_LENGTH /* data[] */ +#elif defined(__GNUC__) && !defined(__STRICT_ANSI__) + #define PM_FLEX_ARRAY_LENGTH 0 /* data[0] */ +#else + #define PM_FLEX_ARRAY_LENGTH 1 /* data[1] */ +#endif + +#endif diff --git a/prism/compiler/force_inline.h b/prism/compiler/force_inline.h new file mode 100644 index 0000000000..e189d592d6 --- /dev/null +++ b/prism/compiler/force_inline.h @@ -0,0 +1,21 @@ +/** + * @file compiler/force_inline.h + */ +#ifndef PRISM_COMPILER_FORCE_INLINE_H +#define PRISM_COMPILER_FORCE_INLINE_H + +#include "prism/compiler/inline.h" + +/** + * Force a function to be inlined at every call site. Use sparingly — only for + * small, hot functions where the compiler's heuristics fail to inline. + */ +#if defined(_MSC_VER) +# define PRISM_FORCE_INLINE __forceinline +#elif defined(__GNUC__) || defined(__clang__) +# define PRISM_FORCE_INLINE PRISM_INLINE __attribute__((always_inline)) +#else +# define PRISM_FORCE_INLINE PRISM_INLINE +#endif + +#endif diff --git a/prism/compiler/format.h b/prism/compiler/format.h new file mode 100644 index 0000000000..32f4c3c6d7 --- /dev/null +++ b/prism/compiler/format.h @@ -0,0 +1,25 @@ +/** + * @file compiler/format.h + */ +#ifndef PRISM_COMPILER_FORMAT_H +#define PRISM_COMPILER_FORMAT_H + +/** + * Certain compilers support specifying that a function accepts variadic + * parameters that look like printf format strings to provide a better developer + * experience when someone is using the function. This macro does that in a + * compiler-agnostic way. + */ +#if defined(__GNUC__) +# if defined(__MINGW_PRINTF_FORMAT) +# define PRISM_ATTRIBUTE_FORMAT(fmt_idx_, arg_idx_) __attribute__((format(__MINGW_PRINTF_FORMAT, fmt_idx_, arg_idx_))) +# else +# define PRISM_ATTRIBUTE_FORMAT(fmt_idx_, arg_idx_) __attribute__((format(printf, fmt_idx_, arg_idx_))) +# endif +#elif defined(__clang__) +# define PRISM_ATTRIBUTE_FORMAT(fmt_idx_, arg_idx_) __attribute__((__format__(__printf__, fmt_idx_, arg_idx_))) +#else +# define PRISM_ATTRIBUTE_FORMAT(fmt_idx_, arg_idx_) +#endif + +#endif diff --git a/prism/compiler/inline.h b/prism/compiler/inline.h new file mode 100644 index 0000000000..856a375691 --- /dev/null +++ b/prism/compiler/inline.h @@ -0,0 +1,17 @@ +/** + * @file compiler/inline.h + */ +#ifndef PRISM_COMPILER_INLINE_H +#define PRISM_COMPILER_INLINE_H + +/** + * Old Visual Studio versions do not support the inline keyword, so we need to + * define it to be __inline. + */ +#if defined(_MSC_VER) && !defined(inline) +# define PRISM_INLINE __inline +#else +# define PRISM_INLINE inline +#endif + +#endif diff --git a/prism/compiler/nodiscard.h b/prism/compiler/nodiscard.h new file mode 100644 index 0000000000..ccd6c00719 --- /dev/null +++ b/prism/compiler/nodiscard.h @@ -0,0 +1,22 @@ +/** + * @file compiler/nodiscard.h + */ +#ifndef PRISM_COMPILER_NODISCARD_H +#define PRISM_COMPILER_NODISCARD_H + +/** + * Mark the return value of a function as important so that the compiler warns + * if a caller ignores it. This is useful for functions that return error codes + * or allocated resources that must be freed. + */ +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L +# define PRISM_NODISCARD [[nodiscard]] +#elif defined(__GNUC__) || defined(__clang__) +# define PRISM_NODISCARD __attribute__((__warn_unused_result__)) +#elif defined(_MSC_VER) +# define PRISM_NODISCARD _Check_return_ +#else +# define PRISM_NODISCARD +#endif + +#endif diff --git a/prism/compiler/nonnull.h b/prism/compiler/nonnull.h new file mode 100644 index 0000000000..9d19355665 --- /dev/null +++ b/prism/compiler/nonnull.h @@ -0,0 +1,18 @@ +/** + * @file compiler/nonnull.h + */ +#ifndef PRISM_COMPILER_NONNULL_H +#define PRISM_COMPILER_NONNULL_H + +/** + * Mark the parameters of a function as non-null. This allows the compiler to + * warn if a caller passes NULL for a parameter that should never be NULL. The + * arguments are the 1-based indices of the parameters. + */ +#if defined(__GNUC__) || defined(__clang__) +# define PRISM_NONNULL(...) __attribute__((__nonnull__(__VA_ARGS__))) +#else +# define PRISM_NONNULL(...) +#endif + +#endif diff --git a/prism/compiler/unused.h b/prism/compiler/unused.h new file mode 100644 index 0000000000..6a9e125dde --- /dev/null +++ b/prism/compiler/unused.h @@ -0,0 +1,18 @@ +/** + * @file compiler/unused.h + */ +#ifndef PRISM_COMPILER_UNUSED_H +#define PRISM_COMPILER_UNUSED_H + +/** + * GCC will warn if you specify a function or parameter that is unused at + * runtime. This macro allows you to mark a function or parameter as unused in a + * compiler-agnostic way. + */ +#if defined(__GNUC__) +# define PRISM_UNUSED __attribute__((unused)) +#else +# define PRISM_UNUSED +#endif + +#endif diff --git a/prism/config.yml b/prism/config.yml index 3d5eee190f..bbbc5f3d33 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -17,6 +17,8 @@ errors: - ARGUMENT_FORWARDING_UNBOUND - ARGUMENT_NO_FORWARDING_AMPERSAND - ARGUMENT_NO_FORWARDING_ELLIPSES + - ARGUMENT_NO_FORWARDING_ELLIPSES_LAMBDA + - ARGUMENT_NO_FORWARDING_ELLIPSES_BLOCK - ARGUMENT_NO_FORWARDING_STAR - ARGUMENT_NO_FORWARDING_STAR_STAR - ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT @@ -60,7 +62,9 @@ errors: - CONDITIONAL_WHILE_PREDICATE - CONSTANT_PATH_COLON_COLON_CONSTANT - DEF_ENDLESS + - DEF_ENDLESS_PARAMETERS - DEF_ENDLESS_SETTER + - DEF_ENDLESS_DO_BLOCK - DEF_NAME - DEF_PARAMS_TERM - DEF_PARAMS_TERM_PAREN @@ -101,6 +105,8 @@ errors: - EXPECT_FOR_DELIMITER - EXPECT_IDENT_REQ_PARAMETER - EXPECT_IN_DELIMITER + - EXPECT_LPAREN_AFTER_NOT_LPAREN + - EXPECT_LPAREN_AFTER_NOT_OTHER - EXPECT_LPAREN_REQ_PARAMETER - EXPECT_MESSAGE - EXPECT_RBRACKET @@ -216,6 +222,7 @@ errors: - PARAMETER_WILD_LOOSE_COMMA - PATTERN_ARRAY_MULTIPLE_RESTS - PATTERN_CAPTURE_DUPLICATE + - PATTERN_CAPTURE_IN_ALTERNATIVE - PATTERN_EXPRESSION_AFTER_BRACKET - PATTERN_EXPRESSION_AFTER_COMMA - PATTERN_EXPRESSION_AFTER_HROCKET @@ -241,7 +248,9 @@ errors: - PATTERN_TERM_PAREN - PIPEPIPEEQ_MULTI_ASSIGN - REGEXP_ENCODING_OPTION_MISMATCH + - REGEXP_ESCAPED_NON_ASCII_IN_UTF8 - REGEXP_INCOMPAT_CHAR_ENCODING + - REGEXP_INVALID_CHAR_PROPERTY - REGEXP_INVALID_UNICODE_RANGE - REGEXP_NON_ESCAPED_MBC - REGEXP_PARSE_ERROR @@ -277,6 +286,7 @@ errors: - UNEXPECTED_INDEX_KEYWORDS - UNEXPECTED_LABEL - UNEXPECTED_MULTI_WRITE + - UNEXPECTED_PARAMETER_DEFAULT_VALUE - UNEXPECTED_RANGE_OPERATOR - UNEXPECTED_SAFE_NAVIGATION - UNEXPECTED_TOKEN_CLOSE_CONTEXT @@ -320,13 +330,44 @@ warnings: - UNUSED_LOCAL_VARIABLE - VOID_STATEMENT tokens: + # The order of the tokens at the beginning is important, because we use them + # for a lookup table. - name: EOF value: 1 comment: final token in the file - - name: MISSING - comment: "a token that was expected but not found" - - name: NOT_PROVIDED - comment: "a token that was not present but it is okay" + - name: BRACE_RIGHT + comment: "}" + - name: COMMA + comment: "," + - name: EMBEXPR_END + comment: "}" + - name: KEYWORD_DO + comment: "do" + - name: KEYWORD_ELSE + comment: "else" + - name: KEYWORD_ELSIF + comment: "elsif" + - name: KEYWORD_END + comment: "end" + - name: KEYWORD_ENSURE + comment: "ensure" + - name: KEYWORD_IN + comment: "in" + - name: KEYWORD_RESCUE + comment: "rescue" + - name: KEYWORD_THEN + comment: "then" + - name: KEYWORD_WHEN + comment: "when" + - name: NEWLINE + comment: "a newline character outside of other tokens" + - name: PARENTHESIS_RIGHT + comment: ")" + - name: PIPE + comment: "|" + - name: SEMICOLON + comment: ";" + # Tokens from here on are not used for lookup, and can be in any order. - name: AMPERSAND comment: "&" - name: AMPERSAND_AMPERSAND @@ -349,8 +390,6 @@ tokens: comment: "!~" - name: BRACE_LEFT comment: "{" - - name: BRACE_RIGHT - comment: "}" - name: BRACKET_LEFT comment: "[" - name: BRACKET_LEFT_ARRAY @@ -373,8 +412,6 @@ tokens: comment: ":" - name: COLON_COLON comment: "::" - - name: COMMA - comment: "," - name: COMMENT comment: "a comment" - name: CONSTANT @@ -393,8 +430,6 @@ tokens: comment: "a line inside of embedded documentation" - name: EMBEXPR_BEGIN comment: "#{" - - name: EMBEXPR_END - comment: "}" - name: EMBVAR comment: "#" - name: EQUAL @@ -461,20 +496,12 @@ tokens: comment: "def" - name: KEYWORD_DEFINED comment: "defined?" - - name: KEYWORD_DO - comment: "do" + - name: KEYWORD_DO_BLOCK + comment: "do keyword for a block attached to a command" - name: KEYWORD_DO_LOOP comment: "do keyword for a predicate in a while, until, or for loop" - - name: KEYWORD_ELSE - comment: "else" - - name: KEYWORD_ELSIF - comment: "elsif" - - name: KEYWORD_END - comment: "end" - name: KEYWORD_END_UPCASE comment: "END" - - name: KEYWORD_ENSURE - comment: "ensure" - name: KEYWORD_FALSE comment: "false" - name: KEYWORD_FOR @@ -483,8 +510,6 @@ tokens: comment: "if" - name: KEYWORD_IF_MODIFIER comment: "if in the modifier form" - - name: KEYWORD_IN - comment: "in" - name: KEYWORD_MODULE comment: "module" - name: KEYWORD_NEXT @@ -497,8 +522,6 @@ tokens: comment: "or" - name: KEYWORD_REDO comment: "redo" - - name: KEYWORD_RESCUE - comment: "rescue" - name: KEYWORD_RESCUE_MODIFIER comment: "rescue in the modifier form" - name: KEYWORD_RETRY @@ -509,8 +532,6 @@ tokens: comment: "self" - name: KEYWORD_SUPER comment: "super" - - name: KEYWORD_THEN - comment: "then" - name: KEYWORD_TRUE comment: "true" - name: KEYWORD_UNDEF @@ -523,8 +544,6 @@ tokens: comment: "until" - name: KEYWORD_UNTIL_MODIFIER comment: "until in the modifier form" - - name: KEYWORD_WHEN - comment: "when" - name: KEYWORD_WHILE comment: "while" - name: KEYWORD_WHILE_MODIFIER @@ -561,16 +580,12 @@ tokens: comment: "-=" - name: MINUS_GREATER comment: "->" - - name: NEWLINE - comment: "a newline character outside of other tokens" - name: NUMBERED_REFERENCE comment: "a numbered reference to a capture group in the previous regular expression match" - name: PARENTHESIS_LEFT comment: "(" - name: PARENTHESIS_LEFT_PARENTHESES comment: "( for a parentheses node" - - name: PARENTHESIS_RIGHT - comment: ")" - name: PERCENT comment: "%" - name: PERCENT_EQUAL @@ -585,8 +600,6 @@ tokens: comment: "%I" - name: PERCENT_UPPER_W comment: "%W" - - name: PIPE - comment: "|" - name: PIPE_EQUAL comment: "|=" - name: PIPE_PIPE @@ -603,8 +616,6 @@ tokens: comment: "the beginning of a regular expression" - name: REGEXP_END comment: "the end of a regular expression" - - name: SEMICOLON - comment: ";" - name: SLASH comment: "/" - name: SLASH_EQUAL @@ -803,8 +814,6 @@ nodes: - GlobalVariableReadNode - BackReferenceReadNode - NumberedReferenceReadNode - - on error: SymbolNode # alias $a b - - on error: MissingNode # alias $a 42 comment: | Represents the old name of the global variable that can be used before aliasing. @@ -813,7 +822,7 @@ nodes: - name: keyword_loc type: location comment: | - The location of the `alias` keyword. + The Location of the `alias` keyword. alias $foo $bar ^^^^^ @@ -845,8 +854,6 @@ nodes: kind: - SymbolNode - InterpolatedSymbolNode - - on error: GlobalVariableReadNode # alias a $b - - on error: MissingNode # alias a 42 comment: | Represents the old name of the method that will be aliased. @@ -861,7 +868,7 @@ nodes: - name: keyword_loc type: location comment: | - Represents the location of the `alias` keyword. + Represents the Location of the `alias` keyword. alias foo bar ^^^^^ @@ -891,7 +898,7 @@ nodes: - name: operator_loc type: location comment: | - Represents the alternation operator location. + Represents the alternation operator Location. foo => bar | baz ^ @@ -927,7 +934,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `and` keyword or the `&&` operator. + The Location of the `and` keyword or the `&&` operator. left and right ^^^ @@ -962,7 +969,7 @@ nodes: - name: opening_loc type: location? comment: | - Represents the optional source location for the opening token. + Represents the optional source Location for the opening token. [1,2,3] # "[" %w[foo bar baz] # "%w[" @@ -971,7 +978,7 @@ nodes: - name: closing_loc type: location? comment: | - Represents the optional source location for the closing token. + Represents the optional source Location for the closing token. [1,2,3] # "]" %w[foo bar baz] # "]" @@ -987,8 +994,19 @@ nodes: - name: constant type: node? kind: - - ConstantReadNode - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the Array + + foo in Bar[] + ^^^ + + foo in Bar[1, 2, 3] + ^^^ + + foo in Bar::Baz[1, 2, 3] + ^^^^^^^^ - name: requireds type: node[] kind: pattern expression @@ -999,7 +1017,9 @@ nodes: ^ ^ - name: rest type: node? - kind: pattern expression + kind: + - ImplicitRestNode + - SplatNode comment: | Represents the rest element of the array pattern. @@ -1016,14 +1036,14 @@ nodes: - name: opening_loc type: location? comment: | - Represents the opening location of the array pattern. + Represents the opening Location of the array pattern. foo in [1, 2] ^ - name: closing_loc type: location? comment: | - Represents the closing location of the array pattern. + Represents the closing Location of the array pattern. foo in [1, 2] ^ @@ -1031,19 +1051,19 @@ nodes: Represents an array pattern in pattern matching. foo in 1, 2 - ^^^^^^^^^^^ + ^^^^ foo in [1, 2] - ^^^^^^^^^^^^^ + ^^^^^^ foo in *bar - ^^^^^^^^^^^ + ^^^^ foo in Bar[] - ^^^^^^^^^^^^ + ^^^^^ foo in Bar[1, 2, 3] - ^^^^^^^^^^^^^^^^^^^ + ^^^^^^^^^^^^ - name: AssocNode fields: - name: key @@ -1074,7 +1094,7 @@ nodes: - name: operator_loc type: location? comment: | - The location of the `=>` operator, if present. + The Location of the `=>` operator, if present. { foo => bar } ^^ @@ -1096,7 +1116,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `**` operator. + The Location of the `**` operator. { **x } ^^ @@ -1125,7 +1145,7 @@ nodes: - name: begin_keyword_loc type: location? comment: | - Represents the location of the `begin` keyword. + Represents the Location of the `begin` keyword. begin x end ^^^^^ @@ -1152,7 +1172,7 @@ nodes: Represents the else clause within the begin block. begin x; rescue y; else z; end - ^^^^^^ + ^^^^^^^^^^^ - name: ensure_clause type: node? kind: EnsureNode @@ -1164,7 +1184,7 @@ nodes: - name: end_keyword_loc type: location? comment: | - Represents the location of the `end` keyword. + Represents the Location of the `end` keyword. begin x end ^^^ @@ -1185,11 +1205,11 @@ nodes: The expression that is being passed as a block argument. This can be any [non-void expression](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). foo(&args) - ^^^^^ + ^^^^ - name: operator_loc type: location comment: | - Represents the location of the `&` operator. + Represents the Location of the `&` operator. foo(&args) ^ @@ -1197,7 +1217,7 @@ nodes: Represents a block argument using `&`. bar(&args) - ^^^^^^^^^^ + ^^^^^ - name: BlockLocalVariableNode flags: ParameterFlags fields: @@ -1250,17 +1270,17 @@ nodes: - name: opening_loc type: location comment: | - Represents the location of the opening `|`. + Represents the Location of the opening `{` or `do`. [1, 2, 3].each { |i| puts x } - ^ + ^ - name: closing_loc type: location comment: | - Represents the location of the closing `|`. + Represents the Location of the closing `}` or `end`. [1, 2, 3].each { |i| puts x } - ^ + ^ comment: | Represents a block of ruby code. @@ -1280,14 +1300,14 @@ nodes: - name: name_loc type: location? comment: | - Represents the location of the block parameter name. + Represents the Location of the block parameter name. def a(&b) ^ - name: operator_loc type: location comment: | - Represents the location of the `&` operator. + Represents the Location of the `&` operator. def a(&b) ^ @@ -1327,7 +1347,7 @@ nodes: - name: opening_loc type: location? comment: | - Represents the opening location of the block parameters. + Represents the opening Location of the block parameters. -> (a, b = 1; local) { } ^ @@ -1338,7 +1358,7 @@ nodes: - name: closing_loc type: location? comment: | - Represents the closing location of the block parameters. + Represents the closing Location of the block parameters. -> (a, b = 1; local) { } ^ @@ -1368,7 +1388,7 @@ nodes: - name: keyword_loc type: location comment: | - The location of the `break` keyword. + The Location of the `break` keyword. break foo ^^^^^ @@ -1391,14 +1411,14 @@ nodes: - name: call_operator_loc type: location? comment: | - Represents the location of the call operator. + Represents the Location of the call operator. foo.bar &&= value ^ - name: message_loc type: location? comment: | - Represents the location of the message. + Represents the Location of the message. foo.bar &&= value ^^^ @@ -1419,7 +1439,7 @@ nodes: - name: operator_loc type: location comment: | - Represents the location of the operator. + Represents the Location of the operator. foo.bar &&= value ^^^ @@ -1456,7 +1476,7 @@ nodes: - name: call_operator_loc type: location? comment: | - Represents the location of the call operator. + Represents the Location of the call operator. foo.bar ^ @@ -1473,14 +1493,15 @@ nodes: - name: message_loc type: location? comment: | - Represents the location of the message. + Represents the Location of the message. foo.bar ^^^ - name: opening_loc type: location? comment: | - Represents the location of the left parenthesis. + Represents the Location of the left parenthesis. + foo(bar) ^ - name: arguments @@ -1494,10 +1515,20 @@ nodes: - name: closing_loc type: location? comment: | - Represents the location of the right parenthesis. + Represents the Location of the right parenthesis. foo(bar) ^ + - name: equal_loc + type: location? + comment: | + Represents the Location of the equal sign, in the case that this is an attribute write. + + foo.bar = value + ^ + + foo[bar] = value + ^ - name: block type: node? kind: @@ -1542,14 +1573,14 @@ nodes: - name: call_operator_loc type: location? comment: | - Represents the location of the call operator. + Represents the Location of the call operator. foo.bar += value ^ - name: message_loc type: location? comment: | - Represents the location of the message. + Represents the Location of the message. foo.bar += value ^^^ @@ -1577,7 +1608,7 @@ nodes: - name: binary_operator_loc type: location comment: | - Represents the location of the binary operator. + Represents the Location of the binary operator. foo.bar += value ^^ @@ -1608,14 +1639,14 @@ nodes: - name: call_operator_loc type: location? comment: | - Represents the location of the call operator. + Represents the Location of the call operator. foo.bar ||= value ^ - name: message_loc type: location? comment: | - Represents the location of the message. + Represents the Location of the message. foo.bar ||= value ^^^ @@ -1636,7 +1667,7 @@ nodes: - name: operator_loc type: location comment: | - Represents the location of the operator. + Represents the Location of the operator. foo.bar ||= value ^^^ @@ -1667,7 +1698,7 @@ nodes: - name: call_operator_loc type: location comment: | - Represents the location of the call operator. + Represents the Location of the call operator. foo.bar = 1 ^ @@ -1681,7 +1712,7 @@ nodes: - name: message_loc type: location comment: | - Represents the location of the message. + Represents the Location of the message. foo.bar = 1 ^^^ @@ -1719,7 +1750,7 @@ nodes: - name: operator_loc type: location comment: | - Represents the location of the `=>` operator. + Represents the Location of the `=>` operator. foo => bar ^^ @@ -1727,7 +1758,7 @@ nodes: Represents assigning to a local variable in pattern matching. foo => [bar => baz] - ^^^^^^^^^^^^ + ^^^^^^^^^^ - name: CaseMatchNode fields: - name: predicate @@ -1737,7 +1768,7 @@ nodes: Represents the predicate of the case match. This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). case true; in false; end - ^^^^ + ^^^^ - name: conditions type: node[] kind: InNode @@ -1753,18 +1784,18 @@ nodes: Represents the else clause of the case match. case true; in false; else; end - ^^^^ + ^^^^^^^^^ - name: case_keyword_loc type: location comment: | - Represents the location of the `case` keyword. + Represents the Location of the `case` keyword. case true; in false; end ^^^^ - name: end_keyword_loc type: location comment: | - Represents the location of the `end` keyword. + Represents the Location of the `end` keyword. case true; in false; end ^^^ @@ -1784,7 +1815,7 @@ nodes: Represents the predicate of the case statement. This can be either `nil` or any [non-void expressions](https://github.com/ruby/prism/blob/main/docs/parsing_rules.md#non-void-expression). case true; when false; end - ^^^^ + ^^^^ - name: conditions type: node[] kind: WhenNode @@ -1800,18 +1831,18 @@ nodes: Represents the else clause of the case statement. case true; when false; else; end - ^^^^ + ^^^^^^^^^ - name: case_keyword_loc type: location comment: | - Represents the location of the `case` keyword. + Represents the Location of the `case` keyword. case true; when false; end ^^^^ - name: end_keyword_loc type: location comment: | - Represents the location of the `end` keyword. + Represents the Location of the `end` keyword. case true; when false; end ^^^ @@ -1828,26 +1859,54 @@ nodes: type: constant[] - name: class_keyword_loc type: location + comment: | + Represents the Location of the `class` keyword. + + class Foo end + ^^^^^ - name: constant_path type: node kind: - ConstantReadNode - ConstantPathNode - - on error: CallNode # class 0.X end - name: inheritance_operator_loc type: location? + comment: | + Represents the Location of the `<` operator. + + class Foo < Bar + ^ - name: superclass type: node? kind: non-void expression + comment: | + Represents the superclass of the class. + + class Foo < Bar + ^^^ - name: body type: node? kind: - StatementsNode - BeginNode + comment: | + Represents the body of the class. + + class Foo; bar; end + ^^^ - name: end_keyword_loc type: location + comment: | + Represents the Location of the `end` keyword. + + class Foo end + ^^^ - name: name type: constant + comment: | + The name of the class. + + class Foo end # name `:Foo` comment: | Represents a class declaration involving the `class` keyword. @@ -1865,14 +1924,14 @@ nodes: - name: name_loc type: location comment: | - Represents the location of the variable name. + Represents the Location of the variable name. @@target &&= value ^^^^^^^^ - name: operator_loc type: location comment: | - Represents the location of the `&&=` operator. + Represents the Location of the `&&=` operator. @@target &&= value ^^^ @@ -1960,7 +2019,7 @@ nodes: - name: name_loc type: location comment: | - The location of the variable name. + The Location of the variable name. @@foo = :bar ^^^^^ @@ -1978,7 +2037,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. @@foo = :bar ^ @@ -2074,7 +2133,7 @@ nodes: - name: delimiter_loc type: location comment: | - The location of the `::` delimiter. + The Location of the `::` delimiter. ::Foo ^^ @@ -2084,7 +2143,7 @@ nodes: - name: name_loc type: location comment: | - The location of the name of the constant. + The Location of the name of the constant. ::Foo ^^^ @@ -2160,7 +2219,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. ::ABC = 123 ^ @@ -2220,7 +2279,7 @@ nodes: - name: name_loc type: location comment: | - The location of the constant name. + The Location of the constant name. FOO = 1 ^^^ @@ -2238,7 +2297,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. FOO = :bar ^ @@ -2363,6 +2422,15 @@ nodes: ^^^^^^ bar end + - name: ErrorRecoveryNode + fields: + - name: unexpected + type: node? + kind: Node + comment: | + The unexpected node that was found in the tree, if there was one. + comment: | + Represents a node that is either missing or unexpected and results in a syntax error. - name: FalseNode comment: | Represents the use of the literal `false` keyword. @@ -2374,23 +2442,66 @@ nodes: - name: constant type: node? kind: - - ConstantReadNode - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the pattern + + foo in Foo(*bar, baz, *qux) + ^^^ - name: left type: node kind: SplatNode + comment: | + Represents the first wildcard node in the pattern. + + foo in *bar, baz, *qux + ^^^^ + + foo in Foo(*bar, baz, *qux) + ^^^^ - name: requireds type: node[] kind: pattern expression + comment: | + Represents the nodes in between the wildcards. + + foo in *bar, baz, *qux + ^^^ + + foo in Foo(*bar, baz, 1, *qux) + ^^^^^^ - name: right type: node - kind: - - SplatNode - - on error: MissingNode + kind: SplatNode + comment: | + Represents the second wildcard node in the pattern. + + foo in *bar, baz, *qux + ^^^^ + + foo in Foo(*bar, baz, *qux) + ^^^^ - name: opening_loc type: location? + comment: | + The Location of the opening brace. + + foo in [*bar, baz, *qux] + ^ + + foo in Foo(*bar, baz, *qux) + ^ - name: closing_loc type: location? + comment: | + The Location of the closing brace. + + foo in [*bar, baz, *qux] + ^ + + foo in Foo(*bar, baz, *qux) + ^ comment: | Represents a find pattern in pattern matching. @@ -2402,6 +2513,9 @@ nodes: foo in Foo(*bar, baz, *qux) ^^^^^^^^^^^^^^^^^^^^ + + foo => *bar, baz, *qux + ^^^^^^^^^^^^^^^ - name: FlipFlopNode flags: RangeFlags fields: @@ -2442,9 +2556,6 @@ nodes: - CallTargetNode - IndexTargetNode - MultiTargetNode - - on error: BackReferenceReadNode # for $& in a end - - on error: NumberedReferenceReadNode # for $1 in a end - - on error: MissingNode # for in 1..10; end comment: | The index expression for `for` loops. @@ -2471,28 +2582,28 @@ nodes: - name: for_keyword_loc type: location comment: | - The location of the `for` keyword. + The Location of the `for` keyword. for i in a end ^^^ - name: in_keyword_loc type: location comment: | - The location of the `in` keyword. + The Location of the `in` keyword. for i in a end ^^ - name: do_keyword_loc type: location? comment: | - The location of the `do` keyword, if present. + The Location of the `do` keyword, if present. for i in a do end ^^ - name: end_keyword_loc type: location comment: | - The location of the `end` keyword. + The Location of the `end` keyword. for i in a end ^^^ @@ -2518,14 +2629,29 @@ nodes: end - name: ForwardingSuperNode fields: + - name: keyword_loc + type: location + comment: | + super + ^^^^^ + + super { 123 } + ^^^^^ - name: block type: node? kind: BlockNode + comment: | + All other arguments are forwarded as normal, except the original block is replaced with the new block. comment: | - Represents the use of the `super` keyword without parentheses or arguments. + Represents the use of the `super` keyword without parentheses or arguments, but which might have a block. super ^^^^^ + + super { 123 } + ^^^^^^^^^^^^^ + + If it has any other arguments, it would be a `SuperNode` instead. - name: GlobalVariableAndWriteNode fields: - name: name @@ -2613,7 +2739,7 @@ nodes: - name: name_loc type: location comment: | - The location of the global variable's name. + The Location of the global variable's name. $foo = :bar ^^^^ @@ -2631,7 +2757,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. $foo = :bar ^ @@ -2645,7 +2771,7 @@ nodes: - name: opening_loc type: location comment: | - The location of the opening brace. + The Location of the opening brace. { a => b } ^ @@ -2665,7 +2791,7 @@ nodes: - name: closing_loc type: location comment: | - The location of the closing brace. + The Location of the closing brace. { a => b } ^ @@ -2679,20 +2805,60 @@ nodes: - name: constant type: node? kind: - - ConstantReadNode - ConstantPathNode + - ConstantReadNode + comment: | + Represents the optional constant preceding the Hash. + + foo => Bar[a: 1, b: 2] + ^^^ + + foo => Bar::Baz[a: 1, b: 2] + ^^^^^^^^ - name: elements type: node[] kind: AssocNode + comment: | + Represents the explicit named hash keys and values. + + foo => { a: 1, b:, ** } + ^^^^^^^^ - name: rest type: node? kind: - AssocSplatNode - NoKeywordsParameterNode + comment: | + Represents the rest of the Hash keys and values. This can be named, unnamed, or explicitly forbidden via `**nil`, this last one results in a `NoKeywordsParameterNode`. + + foo => { a: 1, b:, **c } + ^^^ + + foo => { a: 1, b:, ** } + ^^ + + foo => { a: 1, b:, **nil } + ^^^^^ - name: opening_loc type: location? + comment: | + The Location of the opening brace. + + foo => { a: 1 } + ^ + + foo => Bar[a: 1] + ^ - name: closing_loc type: location? + comment: | + The Location of the closing brace. + + foo => { a: 1 } + ^ + + foo => Bar[a: 1] + ^ comment: | Represents a hash pattern in pattern matching. @@ -2701,12 +2867,18 @@ nodes: foo => { a: 1, b: 2, **c } ^^^^^^^^^^^^^^^^^^^ + + foo => Bar[a: 1, b: 2] + ^^^^^^^^^^^^^^^ + + foo in { a: 1, b: 2 } + ^^^^^^^^^^^^^^ - name: IfNode fields: - name: if_keyword_loc type: location? comment: | - The location of the `if` keyword if present. + The Location of the `if` keyword if present. bar if foo ^^ @@ -2731,7 +2903,7 @@ nodes: - name: then_keyword_loc type: location? comment: | - The location of the `then` keyword (if present) or the `?` in a ternary expression, `nil` otherwise. + The Location of the `then` keyword (if present) or the `?` in a ternary expression, `nil` otherwise. if foo then bar end ^^^^ @@ -2772,7 +2944,7 @@ nodes: - name: end_keyword_loc type: location? comment: | - The location of the `end` keyword if present, `nil` otherwise. + The Location of the `end` keyword if present, `nil` otherwise. if foo bar @@ -3057,7 +3229,7 @@ nodes: - name: name_loc type: location comment: | - The location of the variable name. + The Location of the variable name. @_x = 1 ^^^ @@ -3075,7 +3247,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. @x = y ^ @@ -3145,7 +3317,6 @@ nodes: - EmbeddedStatementsNode - EmbeddedVariableNode - InterpolatedStringNode # `"a" "#{b}"` - - on error: XStringNode # `<<`FOO` "bar" - name: closing_loc type: location? newline: parts @@ -3353,6 +3524,9 @@ nodes: foo, bar = baz ^^^ ^^^ + + foo => baz + ^^^ - name: LocalVariableWriteNode fields: - name: name @@ -3376,7 +3550,7 @@ nodes: - name: name_loc type: location comment: | - The location of the variable name. + The Location of the variable name. foo = :bar ^^^ @@ -3398,7 +3572,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `=` operator. + The Location of the `=` operator. x = :y ^ @@ -3443,11 +3617,65 @@ nodes: - name: value type: node kind: non-void expression + comment: | + Represents the left-hand side of the operator. + + foo => bar + ^^^ - name: pattern type: node kind: pattern expression + comment: | + Represents the right-hand side of the operator. The type of the node depends on the expression. + + Anything that looks like a local variable name (including `_`) will result in a `LocalVariableTargetNode`. + + foo => a # This is equivalent to writing `a = foo` + ^ + + Using an explicit `Array` or combining expressions with `,` will result in a `ArrayPatternNode`. This can be preceded by a constant. + + foo => [a] + ^^^ + + foo => a, b + ^^^^ + + foo => Bar[a, b] + ^^^^^^^^^ + + If the array pattern contains at least two wildcard matches, a `FindPatternNode` is created instead. + + foo => *, 1, *a + ^^^^^ + + Using an explicit `Hash` or a constant with square brackets and hash keys in the square brackets will result in a `HashPatternNode`. + + foo => { a: 1, b: } + + foo => Bar[a: 1, b:] + + foo => Bar[**] + + To use any variable that needs run time evaluation, pinning is required. This results in a `PinnedVariableNode` + + foo => ^a + ^^ + + Similar, any expression can be used with pinning. This results in a `PinnedExpressionNode`. + + foo => ^(a + 1) + + Anything else will result in the regular node for that expression, for example a `ConstantReadNode`. + + foo => CONST - name: operator_loc type: location + comment: | + The Location of the operator. + + foo => bar + ^^ comment: | Represents the use of the `=>` operator. @@ -3466,9 +3694,6 @@ nodes: /(?<foo>bar)/ =~ baz ^^^^^^^^^^^^^^^^^^^^ - - name: MissingNode - comment: | - Represents a node that is missing from the source and results in a syntax error. - name: ModuleNode fields: - name: locals @@ -3480,7 +3705,6 @@ nodes: kind: - ConstantReadNode - ConstantPathNode - - on error: MissingNode # module Parent module end - name: body type: node? kind: @@ -3510,8 +3734,6 @@ nodes: - IndexTargetNode - MultiTargetNode - RequiredParameterNode # def m((a,b)); end - - on error: BackReferenceReadNode # a, (b, $&) = z - - on error: NumberedReferenceReadNode # a, (b, $1) = z comment: | Represents the targets expressions before a splat node. @@ -3555,8 +3777,6 @@ nodes: - IndexTargetNode - MultiTargetNode - RequiredParameterNode # def m((*,b)); end - - on error: BackReferenceReadNode # a, (*, $&) = z - - on error: NumberedReferenceReadNode # a, (*, $1) = z comment: | Represents the targets expressions after a splat node. @@ -3565,14 +3785,14 @@ nodes: - name: lparen_loc type: location? comment: | - The location of the opening parenthesis. + The Location of the opening parenthesis. a, (b, c) = 1, 2, 3 ^ - name: rparen_loc type: location? comment: | - The location of the closing parenthesis. + The Location of the closing parenthesis. a, (b, c) = 1, 2, 3 ^ @@ -3600,8 +3820,6 @@ nodes: - CallTargetNode - IndexTargetNode - MultiTargetNode - - on error: BackReferenceReadNode # $&, = z - - on error: NumberedReferenceReadNode # $1, = z comment: | Represents the targets expressions before a splat node. @@ -3644,8 +3862,6 @@ nodes: - CallTargetNode - IndexTargetNode - MultiTargetNode - - on error: BackReferenceReadNode # *, $& = z - - on error: NumberedReferenceReadNode # *, $1 = z comment: | Represents the targets expressions after a splat node. @@ -3654,21 +3870,21 @@ nodes: - name: lparen_loc type: location? comment: | - The location of the opening parenthesis. + The Location of the opening parenthesis. (a, b, c) = 1, 2, 3 ^ - name: rparen_loc type: location? comment: | - The location of the closing parenthesis. + The Location of the closing parenthesis. (a, b, c) = 1, 2, 3 ^ - name: operator_loc type: location comment: | - The location of the operator. + The Location of the operator. a, b, c = 1, 2, 3 ^ @@ -3703,6 +3919,18 @@ nodes: nil ^^^ + - name: NoBlockParameterNode + fields: + - name: operator_loc + type: location + - name: keyword_loc + type: location + comment: | + Represents the use of `&nil` inside method arguments. + + def a(&nil) + ^^^^ + end - name: NoKeywordsParameterNode fields: - name: operator_loc @@ -3802,7 +4030,7 @@ nodes: - name: operator_loc type: location comment: | - The location of the `or` keyword or the `||` operator. + The Location of the `or` keyword or the `||` operator. left or right ^^ @@ -3831,11 +4059,6 @@ nodes: kind: - RequiredParameterNode - MultiTargetNode - # On parsing error of `f(**kwargs, ...)` or `f(**nil, ...)`, the keyword_rest value is moved here: - - on error: KeywordRestParameterNode - - on error: NoKeywordsParameterNode - # On parsing error of `f(..., ...)`, the first forwarding parameter is moved here: - - on error: ForwardingParameterNode - name: keywords type: node[] kind: @@ -3849,7 +4072,9 @@ nodes: - NoKeywordsParameterNode - name: block type: node? - kind: BlockParameterNode + kind: + - BlockParameterNode + - NoBlockParameterNode comment: | Represents the list of parameters on a method, block, or lambda definition. @@ -3877,12 +4102,32 @@ nodes: - name: expression type: node kind: non-void expression + comment: | + The expression used in the pinned expression + + foo in ^(bar) + ^^^ - name: operator_loc type: location + comment: | + The Location of the `^` operator + + foo in ^(bar) + ^ - name: lparen_loc type: location + comment: | + The Location of the opening parenthesis. + + foo in ^(bar) + ^ - name: rparen_loc type: location + comment: | + The Location of the closing parenthesis. + + foo in ^(bar) + ^ comment: | Represents the use of the `^` operator for pinning an expression in a pattern matching expression. @@ -3900,9 +4145,18 @@ nodes: - BackReferenceReadNode # foo in ^$& - NumberedReferenceReadNode # foo in ^$1 - ItLocalVariableReadNode # proc { 1 in ^it } - - on error: MissingNode # foo in ^Bar + comment: | + The variable used in the pinned expression + + foo in ^bar + ^^^ - name: operator_loc type: location + comment: | + The Location of the `^` operator + + foo in ^bar + ^ comment: | Represents the use of the `^` operator for pinning a variable in a pattern matching expression. @@ -3973,11 +4227,11 @@ nodes: 1...foo ^^^ - If neither right-hand or left-hand side was included, this will be a MissingNode. + If neither right-hand or left-hand side was included, this will be an ErrorRecoveryNode. - name: operator_loc type: location comment: | - The location of the `..` or `...` operator. + The Location of the `..` or `...` operator. comment: | Represents the use of the `..` or `...` operators. @@ -4088,9 +4342,6 @@ nodes: - ConstantPathTargetNode - CallTargetNode - IndexTargetNode - - on error: BackReferenceReadNode # => begin; rescue => $&; end - - on error: NumberedReferenceReadNode # => begin; rescue => $1; end - - on error: MissingNode # begin; rescue =>; end - name: then_keyword_loc type: location? - name: statements @@ -4203,7 +4454,7 @@ nodes: fields: - name: filepath type: string - comment: Represents the file path being parsed. This corresponds directly to the `filepath` option given to the various `Prism::parse*` APIs. + comment: Represents the file path being parsed. This corresponds directly to the `filepath` option given to the various `Prism.parse*` APIs. comment: | Represents the use of the `__FILE__` keyword. @@ -4268,6 +4519,7 @@ nodes: - name: arguments type: node? kind: ArgumentsNode + comment: "Can be only `nil` when there are empty parentheses, like `super()`." - name: rparen_loc type: location? - name: block @@ -4283,6 +4535,8 @@ nodes: super foo, bar ^^^^^^^^^^^^^^ + + If no arguments are provided (except for a block), it would be a `ForwardingSuperNode` instead. - name: SymbolNode flags: SymbolFlags fields: @@ -4327,7 +4581,7 @@ nodes: - name: keyword_loc type: location comment: | - The location of the `unless` keyword. + The Location of the `unless` keyword. unless cond then bar end ^^^^^^ @@ -4348,7 +4602,7 @@ nodes: - name: then_keyword_loc type: location? comment: | - The location of the `then` keyword, if present. + The Location of the `then` keyword, if present. unless cond then bar end ^^^^ @@ -4368,11 +4622,11 @@ nodes: The else clause of the unless expression, if present. unless cond then bar else baz end - ^^^^^^^^ + ^^^^^^^^^^^^ - name: end_keyword_loc type: location? comment: | - The location of the `end` keyword, if present. + The Location of the `end` keyword, if present. unless cond then bar end ^^^ diff --git a/prism/util/pm_constant_pool.c b/prism/constant_pool.c index 38ea01a228..90201ebb8e 100644 --- a/prism/util/pm_constant_pool.c +++ b/prism/constant_pool.c @@ -1,4 +1,11 @@ -#include "prism/util/pm_constant_pool.h" +#include "prism/internal/constant_pool.h" + +#include "prism/compiler/align.h" +#include "prism/compiler/inline.h" +#include "prism/internal/arena.h" + +#include <assert.h> +#include <stdbool.h> /** * Initialize a list of constant ids. @@ -14,10 +21,9 @@ pm_constant_id_list_init(pm_constant_id_list_t *list) { * Initialize a list of constant ids with a given capacity. */ void -pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity) { +pm_constant_id_list_init_capacity(pm_arena_t *arena, pm_constant_id_list_t *list, size_t capacity) { if (capacity) { - list->ids = xcalloc(capacity, sizeof(pm_constant_id_t)); - if (list->ids == NULL) abort(); + list->ids = (pm_constant_id_t *) pm_arena_zalloc(arena, capacity * sizeof(pm_constant_id_t), PRISM_ALIGNOF(pm_constant_id_t)); } else { list->ids = NULL; } @@ -27,19 +33,23 @@ pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity) } /** - * Append a constant id to a list of constant ids. Returns false if any - * potential reallocations fail. + * Append a constant id to a list of constant ids. */ -bool -pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id) { +void +pm_constant_id_list_append(pm_arena_t *arena, pm_constant_id_list_t *list, pm_constant_id_t id) { if (list->size >= list->capacity) { - list->capacity = list->capacity == 0 ? 8 : list->capacity * 2; - list->ids = (pm_constant_id_t *) xrealloc(list->ids, sizeof(pm_constant_id_t) * list->capacity); - if (list->ids == NULL) return false; + size_t new_capacity = list->capacity == 0 ? 8 : list->capacity * 2; + pm_constant_id_t *new_ids = (pm_constant_id_t *) pm_arena_alloc(arena, sizeof(pm_constant_id_t) * new_capacity, PRISM_ALIGNOF(pm_constant_id_t)); + + if (list->size > 0) { + memcpy(new_ids, list->ids, list->size * sizeof(pm_constant_id_t)); + } + + list->ids = new_ids; + list->capacity = new_capacity; } list->ids[list->size++] = id; - return true; } /** @@ -66,29 +76,66 @@ pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id) { } /** - * Free the memory associated with a list of constant ids. + * A multiply-xorshift hash that processes input a word at a time. This is + * significantly faster than the byte-at-a-time djb2 hash for the short strings + * typical in Ruby source (~15 bytes average). Each word is mixed into the hash + * by XOR followed by multiplication by a large odd constant, which spreads + * entropy across all bits. A final xorshift fold produces the 32-bit result. */ -void -pm_constant_id_list_free(pm_constant_id_list_t *list) { - if (list->ids != NULL) { - xfree(list->ids); - } -} - -/** - * A relatively simple hash function (djb2) that is used to hash strings. We are - * optimizing here for simplicity and speed. - */ -static inline uint32_t +static PRISM_INLINE uint32_t pm_constant_pool_hash(const uint8_t *start, size_t length) { - // This is a prime number used as the initial value for the hash function. - uint32_t value = 5381; + // This constant is borrowed from wyhash. It is a 64-bit odd integer with + // roughly equal 0/1 bits, chosen for good avalanche behavior when used in + // multiply-xorshift sequences. + static const uint64_t secret = 0x517cc1b727220a95ULL; + uint64_t hash = (uint64_t) length; + + if (length <= 8) { + // Short strings: read first and last 4 bytes (overlapping for len < 8). + // This covers the majority of Ruby identifiers with a single multiply. + if (length >= 4) { + uint32_t a, b; + memcpy(&a, start, 4); + memcpy(&b, start + length - 4, 4); + hash ^= (uint64_t) a | ((uint64_t) b << 32); + } else if (length > 0) { + hash ^= (uint64_t) start[0] | ((uint64_t) start[length >> 1] << 8) | ((uint64_t) start[length - 1] << 16); + } + hash *= secret; + } else if (length <= 16) { + // Medium strings: read first and last 8 bytes (overlapping). + // Two multiplies instead of the three the loop-based approach needs. + uint64_t word; + memcpy(&word, start, 8); + hash ^= word; + hash *= secret; + memcpy(&word, start + length - 8, 8); + hash ^= word; + hash *= secret; + } else { + const uint8_t *ptr = start; + size_t remaining = length; + + while (remaining >= 8) { + uint64_t word; + memcpy(&word, ptr, 8); + hash ^= word; + hash *= secret; + ptr += 8; + remaining -= 8; + } - for (size_t index = 0; index < length; index++) { - value = ((value << 5) + value) + start[index]; + if (remaining > 0) { + // Read the last 8 bytes (overlapping with already-processed data). + uint64_t word; + memcpy(&word, start + length - 8, 8); + hash ^= word; + hash *= secret; + } } - return value; + hash ^= hash >> 32; + return (uint32_t) hash; } /** @@ -121,21 +168,15 @@ is_power_of_two(uint32_t size) { /** * Resize a constant pool to a given capacity. */ -static inline bool -pm_constant_pool_resize(pm_constant_pool_t *pool) { +static PRISM_INLINE void +pm_constant_pool_resize(pm_arena_t *arena, pm_constant_pool_t *pool) { assert(is_power_of_two(pool->capacity)); uint32_t next_capacity = pool->capacity * 2; - if (next_capacity < pool->capacity) return false; - const uint32_t mask = next_capacity - 1; - const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); - - void *next = xcalloc(next_capacity, element_size); - if (next == NULL) return false; - pm_constant_pool_bucket_t *next_buckets = next; - pm_constant_t *next_constants = (void *)(((char *) next) + next_capacity * sizeof(pm_constant_pool_bucket_t)); + pm_constant_pool_bucket_t *next_buckets = (pm_constant_pool_bucket_t *) pm_arena_zalloc(arena, next_capacity * sizeof(pm_constant_pool_bucket_t), PRISM_ALIGNOF(pm_constant_pool_bucket_t)); + pm_constant_t *next_constants = (pm_constant_t *) pm_arena_alloc(arena, next_capacity * sizeof(pm_constant_t), PRISM_ALIGNOF(pm_constant_t)); // For each bucket in the current constant pool, find the index in the // next constant pool, and insert it. @@ -163,33 +204,22 @@ pm_constant_pool_resize(pm_constant_pool_t *pool) { // The constants are stable with respect to hash table resizes. memcpy(next_constants, pool->constants, pool->size * sizeof(pm_constant_t)); - // pool->constants and pool->buckets are allocated out of the same chunk - // of memory, with the buckets coming first. - xfree(pool->buckets); pool->constants = next_constants; pool->buckets = next_buckets; pool->capacity = next_capacity; - return true; } /** * Initialize a new constant pool with a given capacity. */ -bool -pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity) { - const uint32_t maximum = (~((uint32_t) 0)); - if (capacity >= ((maximum / 2) + 1)) return false; - +void +pm_constant_pool_init(pm_arena_t *arena, pm_constant_pool_t *pool, uint32_t capacity) { capacity = next_power_of_two(capacity); - const size_t element_size = sizeof(pm_constant_pool_bucket_t) + sizeof(pm_constant_t); - void *memory = xcalloc(capacity, element_size); - if (memory == NULL) return false; - pool->buckets = memory; - pool->constants = (void *)(((char *)memory) + capacity * sizeof(pm_constant_pool_bucket_t)); + pool->buckets = (pm_constant_pool_bucket_t *) pm_arena_zalloc(arena, capacity * sizeof(pm_constant_pool_bucket_t), PRISM_ALIGNOF(pm_constant_pool_bucket_t)); + pool->constants = (pm_constant_t *) pm_arena_alloc(arena, capacity * sizeof(pm_constant_t), PRISM_ALIGNOF(pm_constant_t)); pool->size = 0; pool->capacity = capacity; - return true; } /** @@ -215,8 +245,7 @@ pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size pm_constant_pool_bucket_t *bucket; while (bucket = &pool->buckets[index], bucket->id != PM_CONSTANT_ID_UNSET) { - pm_constant_t *constant = &pool->constants[bucket->id - 1]; - if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + if ((bucket->length == length) && memcmp(bucket->start, start, length) == 0) { return bucket->id; } @@ -229,10 +258,10 @@ pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size /** * Insert a constant into a constant pool and return its index in the pool. */ -static inline pm_constant_id_t -pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t length, pm_constant_pool_bucket_type_t type) { +static PRISM_INLINE pm_constant_id_t +pm_constant_pool_insert(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length, pm_constant_pool_bucket_type_t type) { if (pool->size >= (pool->capacity / 4 * 3)) { - if (!pm_constant_pool_resize(pool)) return PM_CONSTANT_ID_UNSET; + pm_constant_pool_resize(arena, pool); } assert(is_power_of_two(pool->capacity)); @@ -246,25 +275,17 @@ pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t l // If there is a collision, then we need to check if the content is the // same as the content we are trying to insert. If it is, then we can // return the id of the existing constant. - pm_constant_t *constant = &pool->constants[bucket->id - 1]; - - if ((constant->length == length) && memcmp(constant->start, start, length) == 0) { + if ((bucket->length == length) && memcmp(bucket->start, start, length) == 0) { // Since we have found a match, we need to check if this is // attempting to insert a shared or an owned constant. We want to // prefer shared constants since they don't require allocations. - if (type == PM_CONSTANT_POOL_BUCKET_OWNED) { - // If we're attempting to insert an owned constant and we have - // an existing constant, then either way we don't want the given - // memory. Either it's duplicated with the existing constant or - // it's not necessary because we have a shared version. - xfree((void *) start); - } else if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { + if (type != PM_CONSTANT_POOL_BUCKET_OWNED && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { // If we're attempting to insert a shared constant and the - // existing constant is owned, then we can free the owned - // constant and replace it with the shared constant. - xfree((void *) constant->start); - constant->start = start; - bucket->type = (unsigned int) (PM_CONSTANT_POOL_BUCKET_DEFAULT & 0x3); + // existing constant is owned, then we can replace it with the + // shared constant to prefer non-owned references. + bucket->start = start; + bucket->type = (unsigned int) (type & 0x3); + pool->constants[bucket->id - 1].start = start; } return bucket->id; @@ -281,7 +302,9 @@ pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t l *bucket = (pm_constant_pool_bucket_t) { .id = (unsigned int) (id & 0x3fffffff), .type = (unsigned int) (type & 0x3), - .hash = hash + .hash = hash, + .start = start, + .length = length }; pool->constants[id - 1] = (pm_constant_t) { @@ -297,8 +320,8 @@ pm_constant_pool_insert(pm_constant_pool_t *pool, const uint8_t *start, size_t l * PM_CONSTANT_ID_UNSET if any potential calls to resize fail. */ pm_constant_id_t -pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { - return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_DEFAULT); +pm_constant_pool_insert_shared(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(arena, pool, start, length, PM_CONSTANT_POOL_BUCKET_DEFAULT); } /** @@ -307,8 +330,8 @@ pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, s * potential calls to resize fail. */ pm_constant_id_t -pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length) { - return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_OWNED); +pm_constant_pool_insert_owned(pm_arena_t *arena, pm_constant_pool_t *pool, uint8_t *start, size_t length) { + return pm_constant_pool_insert(arena, pool, start, length, PM_CONSTANT_POOL_BUCKET_OWNED); } /** @@ -317,26 +340,21 @@ pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t l * resize fail. */ pm_constant_id_t -pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length) { - return pm_constant_pool_insert(pool, start, length, PM_CONSTANT_POOL_BUCKET_CONSTANT); +pm_constant_pool_insert_constant(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length) { + return pm_constant_pool_insert(arena, pool, start, length, PM_CONSTANT_POOL_BUCKET_CONSTANT); } /** - * Free the memory associated with a constant pool. + * Return a raw pointer to the start of a constant. */ -void -pm_constant_pool_free(pm_constant_pool_t *pool) { - // For each constant in the current constant pool, free the contents if the - // contents are owned. - for (uint32_t index = 0; index < pool->capacity; index++) { - pm_constant_pool_bucket_t *bucket = &pool->buckets[index]; - - // If an id is set on this constant, then we know we have content here. - if (bucket->id != PM_CONSTANT_ID_UNSET && bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED) { - pm_constant_t *constant = &pool->constants[bucket->id - 1]; - xfree((void *) constant->start); - } - } +const uint8_t * +pm_constant_start(const pm_constant_t *constant) { + return constant->start; +} - xfree(pool->buckets); +/** + * Return the length of a constant. + */ +size_t pm_constant_length(const pm_constant_t *constant) { + return constant->length; } diff --git a/prism/constant_pool.h b/prism/constant_pool.h new file mode 100644 index 0000000000..dc03235c70 --- /dev/null +++ b/prism/constant_pool.h @@ -0,0 +1,81 @@ +/** + * @file constant_pool.h + * + * A data structure that stores a set of strings. + * + * Each string is assigned a unique id, which can be used to compare strings for + * equality. This comparison ends up being much faster than strcmp, since it + * only requires a single integer comparison. + */ +#ifndef PRISM_CONSTANT_POOL_H +#define PRISM_CONSTANT_POOL_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include "prism/arena.h" + +#include <stddef.h> +#include <stdint.h> + +/** + * A constant id is a unique identifier for a constant in the constant pool. + */ +typedef uint32_t pm_constant_id_t; + +/** + * A list of constant IDs. Usually used to represent a set of locals. + */ +typedef struct { + /** The number of constant ids in the list. */ + size_t size; + + /** The number of constant ids that have been allocated in the list. */ + size_t capacity; + + /** The constant ids in the list. */ + pm_constant_id_t *ids; +} pm_constant_id_list_t; + +/** A constant in the pool which effectively stores a string. */ +typedef struct pm_constant_t pm_constant_t; + +/** + * The overall constant pool, which stores constants found while parsing. + */ +typedef struct pm_constant_pool_t pm_constant_pool_t; + +/** + * Return a raw pointer to the start of a constant. + * + * @param constant The constant to get the start of. + * @returns A raw pointer to the start of the constant. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_constant_start(const pm_constant_t *constant) PRISM_NONNULL(1); + +/** + * Return the length of a constant. + * + * @param constant The constant to get the length of. + * @returns The length of the constant. + */ +PRISM_EXPORTED_FUNCTION size_t pm_constant_length(const pm_constant_t *constant) PRISM_NONNULL(1); + +/** + * Initialize a list of constant ids. + * + * @param list The list to initialize. + */ +PRISM_EXPORTED_FUNCTION void pm_constant_id_list_init(pm_constant_id_list_t *list) PRISM_NONNULL(1); + +/** + * Append a constant id to a list of constant ids. + * + * @param arena The arena to use for allocations. + * @param list The list to append to. + * @param id The constant id to append. + */ +PRISM_EXPORTED_FUNCTION void pm_constant_id_list_append(pm_arena_t *arena, pm_constant_id_list_t *list, pm_constant_id_t id) PRISM_NONNULL(1, 2); + +#endif diff --git a/prism/defines.h b/prism/defines.h deleted file mode 100644 index e31429c789..0000000000 --- a/prism/defines.h +++ /dev/null @@ -1,260 +0,0 @@ -/** - * @file defines.h - * - * Macro definitions used throughout the prism library. - * - * This file should be included first by any *.h or *.c in prism for consistency - * and to ensure that the macros are defined before they are used. - */ -#ifndef PRISM_DEFINES_H -#define PRISM_DEFINES_H - -#include <ctype.h> -#include <limits.h> -#include <math.h> -#include <stdarg.h> -#include <stddef.h> -#include <stdint.h> -#include <stdio.h> -#include <string.h> - -/** - * We want to be able to use the PRI* macros for printing out integers, but on - * some platforms they aren't included unless this is already defined. - */ -#define __STDC_FORMAT_MACROS -// Include sys/types.h before inttypes.h to work around issue with -// certain versions of GCC and newlib which causes omission of PRIx64 -#include <sys/types.h> -#include <inttypes.h> - -/** - * When we are parsing using recursive descent, we want to protect against - * malicious payloads that could attempt to crash our parser. We do this by - * specifying a maximum depth to which we are allowed to recurse. - */ -#ifndef PRISM_DEPTH_MAXIMUM - #define PRISM_DEPTH_MAXIMUM 10000 -#endif - -/** - * By default, we compile with -fvisibility=hidden. When this is enabled, we - * need to mark certain functions as being publically-visible. This macro does - * that in a compiler-agnostic way. - */ -#ifndef PRISM_EXPORTED_FUNCTION -# ifdef PRISM_EXPORT_SYMBOLS -# ifdef _WIN32 -# define PRISM_EXPORTED_FUNCTION __declspec(dllexport) extern -# else -# define PRISM_EXPORTED_FUNCTION __attribute__((__visibility__("default"))) extern -# endif -# else -# define PRISM_EXPORTED_FUNCTION -# endif -#endif - -/** - * Certain compilers support specifying that a function accepts variadic - * parameters that look like printf format strings to provide a better developer - * experience when someone is using the function. This macro does that in a - * compiler-agnostic way. - */ -#if defined(__GNUC__) -# if defined(__MINGW_PRINTF_FORMAT) -# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((format(__MINGW_PRINTF_FORMAT, string_index, argument_index))) -# else -# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((format(printf, string_index, argument_index))) -# endif -#elif defined(__clang__) -# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) __attribute__((__format__(__printf__, string_index, argument_index))) -#else -# define PRISM_ATTRIBUTE_FORMAT(string_index, argument_index) -#endif - -/** - * GCC will warn if you specify a function or parameter that is unused at - * runtime. This macro allows you to mark a function or parameter as unused in a - * compiler-agnostic way. - */ -#if defined(__GNUC__) -# define PRISM_ATTRIBUTE_UNUSED __attribute__((unused)) -#else -# define PRISM_ATTRIBUTE_UNUSED -#endif - -/** - * Old Visual Studio versions do not support the inline keyword, so we need to - * define it to be __inline. - */ -#if defined(_MSC_VER) && !defined(inline) -# define inline __inline -#endif - -/** - * Old Visual Studio versions before 2015 do not implement sprintf, but instead - * implement _snprintf. We standard that here. - */ -#if !defined(snprintf) && defined(_MSC_VER) && (_MSC_VER < 1900) -# define snprintf _snprintf -#endif - -/** - * A simple utility macro to concatenate two tokens together, necessary when one - * of the tokens is itself a macro. - */ -#define PM_CONCATENATE(left, right) left ## right - -/** - * We want to be able to use static assertions, but they weren't standardized - * until C11. As such, we polyfill it here by making a hacky typedef that will - * fail to compile due to a negative array size if the condition is false. - */ -#if defined(_Static_assert) -# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message) -#else -# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1] -#endif - -/** - * In general, libc for embedded systems does not support memory-mapped files. - * If the target platform is POSIX or Windows, we can map a file in memory and - * read it in a more efficient manner. - */ -#ifdef _WIN32 -# define PRISM_HAS_MMAP -#else -# include <unistd.h> -# ifdef _POSIX_MAPPED_FILES -# define PRISM_HAS_MMAP -# endif -#endif - -/** - * If PRISM_HAS_NO_FILESYSTEM is defined, then we want to exclude all filesystem - * related code from the library. All filesystem related code should be guarded - * by PRISM_HAS_FILESYSTEM. - */ -#ifndef PRISM_HAS_NO_FILESYSTEM -# define PRISM_HAS_FILESYSTEM -#endif - -/** - * isinf on POSIX systems it accepts a float, a double, or a long double. - * But mingw didn't provide an isinf macro, only an isinf function that only - * accepts floats, so we need to use _finite instead. - */ -#ifdef __MINGW64__ - #include <float.h> - #define PRISM_ISINF(x) (!_finite(x)) -#else - #define PRISM_ISINF(x) isinf(x) -#endif - -/** - * If you build prism with a custom allocator, configure it with - * "-D PRISM_XALLOCATOR" to use your own allocator that defines xmalloc, - * xrealloc, xcalloc, and xfree. - * - * For example, your `prism_xallocator.h` file could look like this: - * - * ``` - * #ifndef PRISM_XALLOCATOR_H - * #define PRISM_XALLOCATOR_H - * #define xmalloc my_malloc - * #define xrealloc my_realloc - * #define xcalloc my_calloc - * #define xfree my_free - * #endif - * ``` - */ -#ifdef PRISM_XALLOCATOR - #include "prism_xallocator.h" -#else - #ifndef xmalloc - /** - * The malloc function that should be used. This can be overridden with - * the PRISM_XALLOCATOR define. - */ - #define xmalloc malloc - #endif - - #ifndef xrealloc - /** - * The realloc function that should be used. This can be overridden with - * the PRISM_XALLOCATOR define. - */ - #define xrealloc realloc - #endif - - #ifndef xcalloc - /** - * The calloc function that should be used. This can be overridden with - * the PRISM_XALLOCATOR define. - */ - #define xcalloc calloc - #endif - - #ifndef xfree - /** - * The free function that should be used. This can be overridden with the - * PRISM_XALLOCATOR define. - */ - #define xfree free - #endif -#endif - -/** - * If PRISM_BUILD_MINIMAL is defined, then we're going to define every possible - * switch that will turn off certain features of prism. - */ -#ifdef PRISM_BUILD_MINIMAL - /** Exclude the serialization API. */ - #define PRISM_EXCLUDE_SERIALIZATION - - /** Exclude the JSON serialization API. */ - #define PRISM_EXCLUDE_JSON - - /** Exclude the Array#pack parser API. */ - #define PRISM_EXCLUDE_PACK - - /** Exclude the prettyprint API. */ - #define PRISM_EXCLUDE_PRETTYPRINT - - /** Exclude the full set of encodings, using the minimal only. */ - #define PRISM_ENCODING_EXCLUDE_FULL -#endif - -/** - * Support PRISM_LIKELY and PRISM_UNLIKELY to help the compiler optimize its - * branch predication. - */ -#if defined(__GNUC__) || defined(__clang__) - /** The compiler should predicate that this branch will be taken. */ - #define PRISM_LIKELY(x) __builtin_expect(!!(x), 1) - - /** The compiler should predicate that this branch will not be taken. */ - #define PRISM_UNLIKELY(x) __builtin_expect(!!(x), 0) -#else - /** Void because this platform does not support branch prediction hints. */ - #define PRISM_LIKELY(x) (x) - - /** Void because this platform does not support branch prediction hints. */ - #define PRISM_UNLIKELY(x) (x) -#endif - -/** - * We use -Wimplicit-fallthrough to guard potentially unintended fall-through between cases of a switch. - * Use PRISM_FALLTHROUGH to explicitly annotate cases where the fallthrough is intentional. - */ -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L // C23 or later - #define PRISM_FALLTHROUGH [[fallthrough]]; -#elif defined(__GNUC__) || defined(__clang__) - #define PRISM_FALLTHROUGH __attribute__((fallthrough)); -#elif defined(_MSC_VER) - #define PRISM_FALLTHROUGH __fallthrough; -#else - #define PRISM_FALLTHROUGH -#endif - -#endif diff --git a/prism/diagnostic.h b/prism/diagnostic.h new file mode 100644 index 0000000000..370061ec56 --- /dev/null +++ b/prism/diagnostic.h @@ -0,0 +1,93 @@ +/** + * @file diagnostic.h + * + * A list of diagnostics generated during parsing. + */ +#ifndef PRISM_DIAGNOSTIC_H +#define PRISM_DIAGNOSTIC_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include "prism/ast.h" + +/** + * An opaque pointer to a diagnostic generated during parsing. + */ +typedef struct pm_diagnostic_t pm_diagnostic_t; + +/** + * The levels of errors generated during parsing. + */ +typedef enum { + /** For errors that should raise a syntax error. */ + PM_ERROR_LEVEL_SYNTAX = 0, + + /** For errors that should raise an argument error. */ + PM_ERROR_LEVEL_ARGUMENT = 1, + + /** For errors that should raise a load error. */ + PM_ERROR_LEVEL_LOAD = 2 +} pm_error_level_t; + +/** + * The levels of warnings generated during parsing. + */ +typedef enum { + /** For warnings which should be emitted if $VERBOSE != nil. */ + PM_WARNING_LEVEL_DEFAULT = 0, + + /** For warnings which should be emitted if $VERBOSE == true. */ + PM_WARNING_LEVEL_VERBOSE = 1 +} pm_warning_level_t; + +/** + * Get the type of the given diagnostic. + * + * @param diagnostic The diagnostic to get the type of. + * @returns The type of the given diagnostic. Note that this is a string + * representation of an internal ID, and is not meant to be relied upon as a + * stable identifier for the diagnostic. We do not guarantee that these will + * not change in the future. This is meant to be used for debugging and + * error reporting purposes, and not for programmatic checks. + */ +PRISM_EXPORTED_FUNCTION const char * pm_diagnostic_type(const pm_diagnostic_t *diagnostic) PRISM_NONNULL(1); + +/** + * Get the location of the given diagnostic. + * + * @param diagnostic The diagnostic to get the location of. + * @returns The location of the given diagnostic. + */ +PRISM_EXPORTED_FUNCTION pm_location_t pm_diagnostic_location(const pm_diagnostic_t *diagnostic) PRISM_NONNULL(1); + +/** + * Get the message of the given diagnostic. + * + * @param diagnostic The diagnostic to get the message of. + * @returns The message of the given diagnostic. + */ +PRISM_EXPORTED_FUNCTION const char * pm_diagnostic_message(const pm_diagnostic_t *diagnostic) PRISM_NONNULL(1); + +/** + * Get the error level associated with the given diagnostic. + * + * @param diagnostic The diagnostic to get the error level of. + * @returns The error level of the given diagnostic. If the diagnostic was a + * warning, or is in any way not an error, then the return value is + * undefined and should not be relied upon. + */ +PRISM_EXPORTED_FUNCTION pm_error_level_t pm_diagnostic_error_level(const pm_diagnostic_t *diagnostic) PRISM_NONNULL(1); + +/** + * Get the warning level associated with the given diagnostic. + * + * @param diagnostic The diagnostic to get the warning level of. + * @returns The warning level of the given diagnostic. If the diagnostic was an + * error, or is in any way not a warning, then the return value is + * undefined and should not be relied upon. + */ +PRISM_EXPORTED_FUNCTION pm_warning_level_t pm_diagnostic_warning_level(const pm_diagnostic_t *diagnostic) PRISM_NONNULL(1); + +#endif diff --git a/prism/encoding.c b/prism/encoding.c index a4aeed104f..c9c2e13056 100644 --- a/prism/encoding.c +++ b/prism/encoding.c @@ -1,8 +1,13 @@ -#include "prism/encoding.h" +#include "prism/internal/encoding.h" + +#include "prism/compiler/unused.h" +#include "prism/internal/strncasecmp.h" + +#include <assert.h> typedef uint32_t pm_unicode_codepoint_t; -#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1450 +#define UNICODE_ALPHA_CODEPOINTS_LENGTH 1508 static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEPOINTS_LENGTH] = { 0x100, 0x2C1, 0x2C6, 0x2D1, @@ -10,7 +15,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x2EC, 0x2EC, 0x2EE, 0x2EE, 0x345, 0x345, - 0x370, 0x374, + 0x363, 0x374, 0x376, 0x377, 0x37A, 0x37D, 0x37F, 0x37F, @@ -50,7 +55,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x840, 0x858, 0x860, 0x86A, 0x870, 0x887, - 0x889, 0x88E, + 0x889, 0x88F, + 0x897, 0x897, 0x8A0, 0x8C9, 0x8D4, 0x8DF, 0x8E3, 0x8E9, @@ -140,7 +146,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xC4A, 0xC4C, 0xC55, 0xC56, 0xC58, 0xC5A, - 0xC5D, 0xC5D, + 0xC5C, 0xC5D, 0xC60, 0xC63, 0xC80, 0xC83, 0xC85, 0xC8C, @@ -152,7 +158,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xCC6, 0xCC8, 0xCCA, 0xCCC, 0xCD5, 0xCD6, - 0xCDD, 0xCDE, + 0xCDC, 0xCDE, 0xCE0, 0xCE3, 0xCF1, 0xCF3, 0xD00, 0xD0C, @@ -264,7 +270,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1C00, 0x1C36, 0x1C4D, 0x1C4F, 0x1C5A, 0x1C7D, - 0x1C80, 0x1C88, + 0x1C80, 0x1C8A, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1CE9, 0x1CEC, @@ -272,7 +278,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1CF5, 0x1CF6, 0x1CFA, 0x1CFA, 0x1D00, 0x1DBF, - 0x1DE7, 0x1DF4, + 0x1DD3, 0x1DF4, 0x1E00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, @@ -352,11 +358,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0xA67F, 0xA6EF, 0xA717, 0xA71F, 0xA722, 0xA788, - 0xA78B, 0xA7CA, - 0xA7D0, 0xA7D1, - 0xA7D3, 0xA7D3, - 0xA7D5, 0xA7D9, - 0xA7F2, 0xA805, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, 0xA807, 0xA827, 0xA840, 0xA873, 0xA880, 0xA8C3, @@ -446,6 +449,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x105A3, 0x105B1, 0x105B3, 0x105B9, 0x105BB, 0x105BC, + 0x105C0, 0x105F3, 0x10600, 0x10736, 0x10740, 0x10755, 0x10760, 0x10767, @@ -464,6 +468,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x108F4, 0x108F5, 0x10900, 0x10915, 0x10920, 0x10939, + 0x10940, 0x10959, 0x10980, 0x109B7, 0x109BE, 0x109BF, 0x10A00, 0x10A03, @@ -483,9 +488,14 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x10C80, 0x10CB2, 0x10CC0, 0x10CF2, 0x10D00, 0x10D27, + 0x10D4A, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, 0x10E80, 0x10EA9, 0x10EAB, 0x10EAC, 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, 0x10F00, 0x10F1C, 0x10F27, 0x10F27, 0x10F30, 0x10F45, @@ -529,6 +539,17 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11350, 0x11350, 0x11357, 0x11357, 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, 0x11400, 0x11441, 0x11443, 0x11445, 0x11447, 0x1144A, @@ -567,6 +588,8 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11A50, 0x11A97, 0x11A9D, 0x11A9D, 0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, 0x11C00, 0x11C08, 0x11C0A, 0x11C36, 0x11C38, 0x11C3E, @@ -588,6 +611,7 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x11D90, 0x11D91, 0x11D93, 0x11D96, 0x11D98, 0x11D98, + 0x11DB0, 0x11DDB, 0x11EE0, 0x11EF6, 0x11F00, 0x11F10, 0x11F12, 0x11F3A, @@ -599,7 +623,9 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x12F90, 0x12FF0, 0x13000, 0x1342F, 0x13441, 0x13446, + 0x13460, 0x143FA, 0x14400, 0x14646, + 0x16100, 0x1612E, 0x16800, 0x16A38, 0x16A40, 0x16A5E, 0x16A70, 0x16ABE, @@ -608,16 +634,19 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x16B40, 0x16B43, 0x16B63, 0x16B77, 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, 0x16F00, 0x16F4A, 0x16F4F, 0x16F87, 0x16F8F, 0x16F9F, 0x16FE0, 0x16FE1, 0x16FE3, 0x16FE3, - 0x16FF0, 0x16FF1, - 0x17000, 0x187F7, - 0x18800, 0x18CD5, - 0x18D00, 0x18D08, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, 0x1AFF0, 0x1AFF3, 0x1AFF5, 0x1AFFB, 0x1AFFD, 0x1AFFE, @@ -677,6 +706,11 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1E290, 0x1E2AD, 0x1E2C0, 0x1E2EB, 0x1E4D0, 0x1E4EB, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5F0, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, 0x1E7E0, 0x1E7E6, 0x1E7E8, 0x1E7EB, 0x1E7ED, 0x1E7EE, @@ -722,16 +756,16 @@ static const pm_unicode_codepoint_t unicode_alpha_codepoints[UNICODE_ALPHA_CODEP 0x1F150, 0x1F169, 0x1F170, 0x1F189, 0x20000, 0x2A6DF, - 0x2A700, 0x2B739, - 0x2B740, 0x2B81D, - 0x2B820, 0x2CEA1, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, 0x2F800, 0x2FA1D, 0x30000, 0x3134A, - 0x31350, 0x323AF, + 0x31350, 0x33479, }; -#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1528 +#define UNICODE_ALNUM_CODEPOINTS_LENGTH 1598 static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEPOINTS_LENGTH] = { 0x100, 0x2C1, 0x2C6, 0x2D1, @@ -739,7 +773,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x2EC, 0x2EC, 0x2EE, 0x2EE, 0x345, 0x345, - 0x370, 0x374, + 0x363, 0x374, 0x376, 0x377, 0x37A, 0x37D, 0x37F, 0x37F, @@ -778,7 +812,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x840, 0x858, 0x860, 0x86A, 0x870, 0x887, - 0x889, 0x88E, + 0x889, 0x88F, + 0x897, 0x897, 0x8A0, 0x8C9, 0x8D4, 0x8DF, 0x8E3, 0x8E9, @@ -872,7 +907,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xC4A, 0xC4C, 0xC55, 0xC56, 0xC58, 0xC5A, - 0xC5D, 0xC5D, + 0xC5C, 0xC5D, 0xC60, 0xC63, 0xC66, 0xC6F, 0xC80, 0xC83, @@ -885,7 +920,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xCC6, 0xCC8, 0xCCA, 0xCCC, 0xCD5, 0xCD6, - 0xCDD, 0xCDE, + 0xCDC, 0xCDE, 0xCE0, 0xCE3, 0xCE6, 0xCEF, 0xCF1, 0xCF3, @@ -1007,7 +1042,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1C00, 0x1C36, 0x1C40, 0x1C49, 0x1C4D, 0x1C7D, - 0x1C80, 0x1C88, + 0x1C80, 0x1C8A, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1CE9, 0x1CEC, @@ -1015,7 +1050,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1CF5, 0x1CF6, 0x1CFA, 0x1CFA, 0x1D00, 0x1DBF, - 0x1DE7, 0x1DF4, + 0x1DD3, 0x1DF4, 0x1E00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, @@ -1094,11 +1129,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0xA67F, 0xA6EF, 0xA717, 0xA71F, 0xA722, 0xA788, - 0xA78B, 0xA7CA, - 0xA7D0, 0xA7D1, - 0xA7D3, 0xA7D3, - 0xA7D5, 0xA7D9, - 0xA7F2, 0xA805, + 0xA78B, 0xA7DC, + 0xA7F1, 0xA805, 0xA807, 0xA827, 0xA840, 0xA873, 0xA880, 0xA8C3, @@ -1191,6 +1223,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x105A3, 0x105B1, 0x105B3, 0x105B9, 0x105BB, 0x105BC, + 0x105C0, 0x105F3, 0x10600, 0x10736, 0x10740, 0x10755, 0x10760, 0x10767, @@ -1209,6 +1242,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x108F4, 0x108F5, 0x10900, 0x10915, 0x10920, 0x10939, + 0x10940, 0x10959, 0x10980, 0x109B7, 0x109BE, 0x109BF, 0x10A00, 0x10A03, @@ -1229,9 +1263,14 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x10CC0, 0x10CF2, 0x10D00, 0x10D27, 0x10D30, 0x10D39, + 0x10D40, 0x10D65, + 0x10D69, 0x10D69, + 0x10D6F, 0x10D85, 0x10E80, 0x10EA9, 0x10EAB, 0x10EAC, 0x10EB0, 0x10EB1, + 0x10EC2, 0x10EC7, + 0x10EFA, 0x10EFC, 0x10F00, 0x10F1C, 0x10F27, 0x10F27, 0x10F30, 0x10F45, @@ -1278,6 +1317,17 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11350, 0x11350, 0x11357, 0x11357, 0x1135D, 0x11363, + 0x11380, 0x11389, + 0x1138B, 0x1138B, + 0x1138E, 0x1138E, + 0x11390, 0x113B5, + 0x113B7, 0x113C0, + 0x113C2, 0x113C2, + 0x113C5, 0x113C5, + 0x113C7, 0x113CA, + 0x113CC, 0x113CD, + 0x113D1, 0x113D1, + 0x113D3, 0x113D3, 0x11400, 0x11441, 0x11443, 0x11445, 0x11447, 0x1144A, @@ -1297,6 +1347,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11680, 0x116B5, 0x116B8, 0x116B8, 0x116C0, 0x116C9, + 0x116D0, 0x116E3, 0x11700, 0x1171A, 0x1171D, 0x1172A, 0x11730, 0x11739, @@ -1322,6 +1373,9 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11A50, 0x11A97, 0x11A9D, 0x11A9D, 0x11AB0, 0x11AF8, + 0x11B60, 0x11B67, + 0x11BC0, 0x11BE0, + 0x11BF0, 0x11BF9, 0x11C00, 0x11C08, 0x11C0A, 0x11C36, 0x11C38, 0x11C3E, @@ -1346,6 +1400,8 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x11D93, 0x11D96, 0x11D98, 0x11D98, 0x11DA0, 0x11DA9, + 0x11DB0, 0x11DDB, + 0x11DE0, 0x11DE9, 0x11EE0, 0x11EF6, 0x11F00, 0x11F10, 0x11F12, 0x11F3A, @@ -1358,7 +1414,10 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x12F90, 0x12FF0, 0x13000, 0x1342F, 0x13441, 0x13446, + 0x13460, 0x143FA, 0x14400, 0x14646, + 0x16100, 0x1612E, + 0x16130, 0x16139, 0x16800, 0x16A38, 0x16A40, 0x16A5E, 0x16A60, 0x16A69, @@ -1370,16 +1429,20 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x16B50, 0x16B59, 0x16B63, 0x16B77, 0x16B7D, 0x16B8F, + 0x16D40, 0x16D6C, + 0x16D70, 0x16D79, 0x16E40, 0x16E7F, + 0x16EA0, 0x16EB8, + 0x16EBB, 0x16ED3, 0x16F00, 0x16F4A, 0x16F4F, 0x16F87, 0x16F8F, 0x16F9F, 0x16FE0, 0x16FE1, 0x16FE3, 0x16FE3, - 0x16FF0, 0x16FF1, - 0x17000, 0x187F7, - 0x18800, 0x18CD5, - 0x18D00, 0x18D08, + 0x16FF0, 0x16FF6, + 0x17000, 0x18CD5, + 0x18CFF, 0x18D1E, + 0x18D80, 0x18DF2, 0x1AFF0, 0x1AFF3, 0x1AFF5, 0x1AFFB, 0x1AFFD, 0x1AFFE, @@ -1394,6 +1457,7 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1BC80, 0x1BC88, 0x1BC90, 0x1BC99, 0x1BC9E, 0x1BC9E, + 0x1CCF0, 0x1CCF9, 0x1D400, 0x1D454, 0x1D456, 0x1D49C, 0x1D49E, 0x1D49F, @@ -1443,6 +1507,11 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1E2F0, 0x1E2F9, 0x1E4D0, 0x1E4EB, 0x1E4F0, 0x1E4F9, + 0x1E5D0, 0x1E5ED, + 0x1E5F0, 0x1E5FA, + 0x1E6C0, 0x1E6DE, + 0x1E6E0, 0x1E6F5, + 0x1E6FE, 0x1E6FF, 0x1E7E0, 0x1E7E6, 0x1E7E8, 0x1E7EB, 0x1E7ED, 0x1E7EE, @@ -1490,16 +1559,16 @@ static const pm_unicode_codepoint_t unicode_alnum_codepoints[UNICODE_ALNUM_CODEP 0x1F170, 0x1F189, 0x1FBF0, 0x1FBF9, 0x20000, 0x2A6DF, - 0x2A700, 0x2B739, - 0x2B740, 0x2B81D, - 0x2B820, 0x2CEA1, + 0x2A700, 0x2B81D, + 0x2B820, 0x2CEAD, 0x2CEB0, 0x2EBE0, + 0x2EBF0, 0x2EE5D, 0x2F800, 0x2FA1D, 0x30000, 0x3134A, - 0x31350, 0x323AF, + 0x31350, 0x33479, }; -#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1302 +#define UNICODE_ISUPPER_CODEPOINTS_LENGTH 1320 static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_CODEPOINTS_LENGTH] = { 0x100, 0x100, 0x102, 0x102, @@ -1774,6 +1843,7 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0x10C7, 0x10C7, 0x10CD, 0x10CD, 0x13A0, 0x13F5, + 0x1C89, 0x1C89, 0x1C90, 0x1CBA, 0x1CBD, 0x1CBF, 0x1E00, 0x1E00, @@ -2103,9 +2173,15 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0xA7C2, 0xA7C2, 0xA7C4, 0xA7C7, 0xA7C9, 0xA7C9, + 0xA7CB, 0xA7CC, + 0xA7CE, 0xA7CE, 0xA7D0, 0xA7D0, + 0xA7D2, 0xA7D2, + 0xA7D4, 0xA7D4, 0xA7D6, 0xA7D6, 0xA7D8, 0xA7D8, + 0xA7DA, 0xA7DA, + 0xA7DC, 0xA7DC, 0xA7F5, 0xA7F5, 0xFF21, 0xFF3A, 0x10400, 0x10427, @@ -2115,8 +2191,10 @@ static const pm_unicode_codepoint_t unicode_isupper_codepoints[UNICODE_ISUPPER_C 0x1058C, 0x10592, 0x10594, 0x10595, 0x10C80, 0x10CB2, + 0x10D50, 0x10D65, 0x118A0, 0x118BF, 0x16E40, 0x16E5F, + 0x16EA0, 0x16EB8, 0x1D400, 0x1D419, 0x1D434, 0x1D44D, 0x1D468, 0x1D481, @@ -2304,6 +2382,10 @@ pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n) { */ size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; } @@ -2324,6 +2406,10 @@ pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n) { */ size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } @@ -2344,6 +2430,10 @@ pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n) { */ bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; } @@ -2362,7 +2452,8 @@ pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n) { static pm_unicode_codepoint_t pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { - if (b[0] < 0x80) { + + if ((n > 0) && (b[0] < 0x80)) { *width = 1; return (pm_unicode_codepoint_t) b[0]; } @@ -2401,6 +2492,10 @@ pm_cesu_8_codepoint(const uint8_t *b, ptrdiff_t n, size_t *width) { static size_t pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + size_t width; pm_cesu_8_codepoint(b, n, &width); return width; @@ -2408,6 +2503,10 @@ pm_encoding_cesu_8_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) ? 1 : 0; } @@ -2424,6 +2523,10 @@ pm_encoding_cesu_8_alpha_char(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & (PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } @@ -2440,6 +2543,10 @@ pm_encoding_cesu_8_alnum_char(const uint8_t *b, ptrdiff_t n) { static bool pm_encoding_cesu_8_isupper_char(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } + if (*b < 0x80) { return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_UPPERCASE_BIT) ? true : false; } @@ -3855,14 +3962,14 @@ static const uint8_t pm_encoding_windows_874_table[256] = { }; #define PRISM_ENCODING_TABLE(name) \ - static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); \ + static size_t pm_encoding_ ##name ## _alpha_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHABETIC_BIT)); \ } \ - static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; \ + static size_t pm_encoding_ ##name ## _alnum_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; \ } \ - static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { \ - return (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT); \ + static bool pm_encoding_ ##name ## _isupper_char(const uint8_t *b, ptrdiff_t n) { \ + return ((n > 0) && (pm_encoding_ ##name ## _table[*b] & PRISM_ENCODING_UPPERCASE_BIT)); \ } PRISM_ENCODING_TABLE(cp850) @@ -3931,8 +4038,8 @@ PRISM_ENCODING_TABLE(windows_874) * means that if the top bit is not set, the character is 1 byte long. */ static size_t -pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return *b < 0x80 ? 1 : 0; +pm_encoding_ascii_char_width(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (*b < 0x80)) ? 1 : 0; } /** @@ -3940,8 +4047,8 @@ pm_encoding_ascii_char_width(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t * alphabetical character. */ static size_t -pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT); +pm_encoding_ascii_alpha_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) ? (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHABETIC_BIT) : 0; } /** @@ -3951,7 +4058,7 @@ pm_encoding_ascii_alpha_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t */ static size_t pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) ? pm_encoding_ascii_alpha_char(b, n) : 0; + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alpha_char(b, n) : 0; } /** @@ -3959,8 +4066,8 @@ pm_encoding_ascii_alpha_char_7bit(const uint8_t *b, ptrdiff_t n) { * alphanumeric character. */ static size_t -pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0; +pm_encoding_ascii_alnum_char(const uint8_t *b, ptrdiff_t n) { + return ((n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT)) ? 1 : 0; } /** @@ -3970,7 +4077,7 @@ pm_encoding_ascii_alnum_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t */ static size_t pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) ? pm_encoding_ascii_alnum_char(b, n) : 0; + return ((n > 0) && (*b < 0x80)) ? pm_encoding_ascii_alnum_char(b, n) : 0; } /** @@ -3978,8 +4085,8 @@ pm_encoding_ascii_alnum_char_7bit(const uint8_t *b, ptrdiff_t n) { * character. */ static bool -pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { - return (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT); +pm_encoding_ascii_isupper_char(const uint8_t *b, ptrdiff_t n) { + return (n > 0) && (pm_encoding_ascii_table[*b] & PRISM_ENCODING_UPPERCASE_BIT); } /** @@ -3987,7 +4094,7 @@ pm_encoding_ascii_isupper_char(const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_ * matter what the codepoint, so this function is shared between them. */ static size_t -pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATTRIBUTE_UNUSED ptrdiff_t n) { +pm_encoding_single_char_width(PRISM_UNUSED const uint8_t *b, PRISM_UNUSED ptrdiff_t n) { return 1; } @@ -3998,7 +4105,7 @@ pm_encoding_single_char_width(PRISM_ATTRIBUTE_UNUSED const uint8_t *b, PRISM_ATT static size_t pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4042,6 +4149,9 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { */ static size_t pm_encoding_shift_jis_char_width(const uint8_t *b, ptrdiff_t n) { + if (n == 0) { + return 0; + } // These are the single byte characters. if (b[0] < 0x80 || (b[0] >= 0xA1 && b[0] <= 0xDF)) { return 1; @@ -4105,7 +4215,7 @@ pm_encoding_shift_jis_isupper_char(const uint8_t *b, ptrdiff_t n) { */ static bool pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { - return (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); + return (n > 0) && (*b < 0x80) && pm_encoding_ascii_isupper_char(b, n); } /** @@ -4115,7 +4225,7 @@ pm_encoding_ascii_isupper_char_7bit(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4134,7 +4244,7 @@ pm_encoding_big5_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters - if (*b <= 0x80) { + if ((n > 0) && (*b <= 0x80)) { return 1; } @@ -4153,7 +4263,7 @@ pm_encoding_cp949_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { // These are the 1 byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4196,7 +4306,7 @@ pm_encoding_emacs_mule_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4215,7 +4325,7 @@ pm_encoding_euc_kr_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4239,7 +4349,7 @@ pm_encoding_euc_tw_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) { // These are the 1 byte characters. - if (*b < 0x80) { + if ((n > 0) && (*b < 0x80)) { return 1; } @@ -4263,7 +4373,7 @@ pm_encoding_gb18030_char_width(const uint8_t *b, ptrdiff_t n) { static size_t pm_encoding_gbk_char_width(const uint8_t *b, ptrdiff_t n) { // These are the single byte characters. - if (*b <= 0x80) { + if ((n > 0) && (*b <= 0x80)) { return 1; } diff --git a/prism/excludes.h b/prism/excludes.h new file mode 100644 index 0000000000..8600622f63 --- /dev/null +++ b/prism/excludes.h @@ -0,0 +1,29 @@ +/** + * @file excludes.h + * + * A header file that defines macros to exclude certain features of the prism + * library. This is useful for reducing the size of the library when certain + * features are not needed. + */ +#ifndef PRISM_EXCLUDES_H +#define PRISM_EXCLUDES_H + +/** + * If PRISM_BUILD_MINIMAL is defined, then we're going to define every possible + * switch that will turn off certain features of prism. + */ +#ifdef PRISM_BUILD_MINIMAL + /** Exclude the serialization API. */ + #define PRISM_EXCLUDE_SERIALIZATION + + /** Exclude the JSON serialization API. */ + #define PRISM_EXCLUDE_JSON + + /** Exclude the prettyprint API. */ + #define PRISM_EXCLUDE_PRETTYPRINT + + /** Exclude the full set of encodings, using the minimal only. */ + #define PRISM_ENCODING_EXCLUDE_FULL +#endif + +#endif diff --git a/prism/extension.c b/prism/extension.c index 1533ca7bb3..27df8dac50 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -4,6 +4,8 @@ #include <ruby/win32.h> #endif +#include <errno.h> + // NOTE: this file should contain only bindings. All non-trivial logic should be // in libprism so it can be shared its the various callers. @@ -25,6 +27,7 @@ VALUE rb_cPrismLexResult; VALUE rb_cPrismParseLexResult; VALUE rb_cPrismStringQuery; VALUE rb_cPrismScope; +VALUE rb_cPrismCurrentVersionError; VALUE rb_cPrismDebugEncoding; @@ -63,18 +66,6 @@ check_string(VALUE value) { return RSTRING_PTR(value); } -/** - * Load the contents and size of the given string into the given pm_string_t. - */ -static void -input_load_string(pm_string_t *input, VALUE string) { - // Check if the string is a string. If it's not, then raise a type error. - if (!RB_TYPE_P(string, T_STRING)) { - rb_raise(rb_eTypeError, "wrong argument type %" PRIsVALUE " (expected String)", rb_obj_class(string)); - } - - pm_string_constant_init(input, RSTRING_PTR(string), RSTRING_LEN(string)); -} /******************************************************************************/ /* Building C options from Ruby options */ @@ -147,10 +138,8 @@ build_options_scopes(pm_options_t *options, VALUE scopes) { // Initialize the scope array. size_t locals_count = RARRAY_LEN(locals); - pm_options_scope_t *options_scope = &options->scopes[scope_index]; - if (!pm_options_scope_init(options_scope, locals_count)) { - rb_raise(rb_eNoMemError, "failed to allocate memory"); - } + pm_options_scope_t *options_scope = pm_options_scope_mut(options, scope_index); + pm_options_scope_init(options_scope, locals_count); // Iterate over the locals and add them to the scope. for (size_t local_index = 0; local_index < locals_count; local_index++) { @@ -163,7 +152,7 @@ build_options_scopes(pm_options_t *options, VALUE scopes) { } // Add the local to the scope. - pm_string_t *scope_local = &options_scope->locals[local_index]; + pm_string_t *scope_local = pm_options_scope_local_mut(options_scope, local_index); const char *name = rb_id2name(SYM2ID(local)); pm_string_constant_init(scope_local, name, strlen(name)); } @@ -199,7 +188,21 @@ build_options_i(VALUE key, VALUE value, VALUE argument) { if (!NIL_P(value)) { const char *version = check_string(value); - if (!pm_options_version_set(options, version, RSTRING_LEN(value))) { + if (RSTRING_LEN(value) == 7 && strncmp(version, "current", 7) == 0) { + if (!pm_options_version_set(options, ruby_version, 3)) { + rb_exc_raise(rb_exc_new_cstr(rb_cPrismCurrentVersionError, ruby_version)); + } + } else if (RSTRING_LEN(value) == 7 && strncmp(version, "nearest", 7) == 0) { + if (!pm_options_version_set(options, ruby_version, 3)) { + // Prism doesn't know this specific version. Is it lower? + if (ruby_version[0] < '3' || (ruby_version[0] == '3' && ruby_version[2] < '3')) { + pm_options_version_set_lowest(options); + } else { + // Must be higher. + pm_options_version_set_highest(options); + } + } + } else if (!pm_options_version_set(options, version, RSTRING_LEN(value))) { rb_raise(rb_eArgError, "invalid version: %" PRIsVALUE, value); } } @@ -263,7 +266,7 @@ build_options(VALUE argument) { */ static void extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) { - options->line = 1; // default + pm_options_line_set(options, 1); /* default */ if (!NIL_P(keywords)) { struct build_options_data data = { .options = options, .keywords = keywords }; @@ -291,36 +294,46 @@ extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) { /** * Read options for methods that look like (source, **options). */ -static void -string_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options) { +static VALUE +string_options(int argc, VALUE *argv, pm_options_t *options) { VALUE string; VALUE keywords; rb_scan_args(argc, argv, "1:", &string, &keywords); + if (!RB_TYPE_P(string, T_STRING)) { + pm_options_free(options); + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(string)); + } + extract_options(options, Qnil, keywords); - input_load_string(input, string); + return string; } /** * Read options for methods that look like (filepath, **options). */ -static void -file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, VALUE *encoded_filepath) { +static pm_source_t * +file_options(int argc, VALUE *argv, pm_options_t *options, VALUE *encoded_filepath) { VALUE filepath; VALUE keywords; rb_scan_args(argc, argv, "1:", &filepath, &keywords); - Check_Type(filepath, T_STRING); + if (!RB_TYPE_P(filepath, T_STRING)) { + pm_options_free(options); + rb_raise(rb_eTypeError, "wrong argument type %"PRIsVALUE" (expected String)", rb_obj_class(filepath)); + } + *encoded_filepath = rb_str_encode_ospath(filepath); extract_options(options, *encoded_filepath, keywords); - const char *source = (const char *) pm_string_source(&options->filepath); - pm_string_init_result_t result; + const char *source = (const char *) pm_string_source(pm_options_filepath(options)); + pm_source_init_result_t result; + pm_source_t *pm_src = pm_source_file_new(source, &result); - switch (result = pm_string_file_init(input, source)) { - case PM_STRING_INIT_SUCCESS: + switch (result) { + case PM_SOURCE_INIT_SUCCESS: break; - case PM_STRING_INIT_ERROR_GENERIC: { + case PM_SOURCE_INIT_ERROR_GENERIC: { pm_options_free(options); #ifdef _WIN32 @@ -332,7 +345,7 @@ file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, V rb_syserr_fail(e, source); break; } - case PM_STRING_INIT_ERROR_DIRECTORY: + case PM_SOURCE_INIT_ERROR_DIRECTORY: pm_options_free(options); rb_syserr_fail(EISDIR, source); break; @@ -341,6 +354,8 @@ file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, V rb_raise(rb_eRuntimeError, "Unknown error (%d) initializing file: %s", result, source); break; } + + return pm_src; } #ifndef PRISM_EXCLUDE_SERIALIZATION @@ -353,77 +368,82 @@ file_options(int argc, VALUE *argv, pm_string_t *input, pm_options_t *options, V * Dump the AST corresponding to the given input to a string. */ static VALUE -dump_input(pm_string_t *input, const pm_options_t *options) { - pm_buffer_t buffer; - if (!pm_buffer_init(&buffer)) { +dump_input(const uint8_t *input, size_t input_length, const pm_options_t *options) { + pm_buffer_t *buffer = pm_buffer_new(); + if (!buffer) { rb_raise(rb_eNoMemError, "failed to allocate memory"); } - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); - pm_node_t *node = pm_parse(&parser); - pm_serialize(&parser, node, &buffer); + pm_node_t *node = pm_parse(parser); + pm_serialize(parser, node, buffer); - VALUE result = rb_str_new(pm_buffer_value(&buffer), pm_buffer_length(&buffer)); - pm_node_destroy(&parser, node); - pm_buffer_free(&buffer); - pm_parser_free(&parser); + VALUE result = rb_str_new(pm_buffer_value(buffer), pm_buffer_length(buffer)); + pm_buffer_free(buffer); + pm_parser_free(parser); + pm_arena_free(arena); return result; } /** + * :markup: markdown * call-seq: - * Prism::dump(source, **options) -> String + * dump(source, **options) -> String * * Dump the AST corresponding to the given string to a string. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE dump(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); + + const uint8_t *source = (const uint8_t *) RSTRING_PTR(string); + size_t length = RSTRING_LEN(string); #ifdef PRISM_BUILD_DEBUG - size_t length = pm_string_length(&input); char* dup = xmalloc(length); - memcpy(dup, pm_string_source(&input), length); - pm_string_constant_init(&input, dup, length); + memcpy(dup, source, length); + source = (const uint8_t *) dup; #endif - VALUE value = dump_input(&input, &options); - if (options.freeze) rb_obj_freeze(value); + VALUE value = dump_input(source, length, options); + if (pm_options_freeze(options)) rb_obj_freeze(value); #ifdef PRISM_BUILD_DEBUG +#ifdef xfree_sized + xfree_sized(dup, length); +#else xfree(dup); #endif +#endif - pm_string_free(&input); - pm_options_free(&options); + pm_options_free(options); return value; } /** + * :markup: markdown * call-seq: - * Prism::dump_file(filepath, **options) -> String + * dump_file(filepath, **options) -> String * * Dump the AST corresponding to the given file to a string. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE dump_file(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE value = dump_input(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = dump_input(pm_source_source(src), pm_source_length(src), options); + pm_source_free(src); + pm_options_free(options); return value; } @@ -449,42 +469,49 @@ rb_class_new_instance_freeze(int argc, const VALUE *argv, VALUE klass, bool free * Create a new Location instance from the given parser and bounds. */ static inline VALUE -parser_location(const pm_parser_t *parser, VALUE source, bool freeze, const uint8_t *start, size_t length) { - VALUE argv[] = { source, LONG2FIX(start - parser->start), LONG2FIX(length) }; +parser_location(VALUE source, bool freeze, uint32_t start, uint32_t length) { + VALUE argv[] = { source, LONG2FIX(start), LONG2FIX(length) }; return rb_class_new_instance_freeze(3, argv, rb_cPrismLocation, freeze); } /** * Create a new Location instance from the given parser and location. */ -#define PARSER_LOCATION_LOC(parser, source, freeze, loc) \ - parser_location(parser, source, freeze, loc.start, (size_t) (loc.end - loc.start)) +#define PARSER_LOCATION(source, freeze, location) \ + parser_location(source, freeze, location.start, location.length) /** * Build a new Comment instance from the given parser and comment. */ static inline VALUE -parser_comment(const pm_parser_t *parser, VALUE source, bool freeze, const pm_comment_t *comment) { - VALUE argv[] = { PARSER_LOCATION_LOC(parser, source, freeze, comment->location) }; - VALUE type = (comment->type == PM_COMMENT_EMBDOC) ? rb_cPrismEmbDocComment : rb_cPrismInlineComment; +parser_comment(VALUE source, bool freeze, const pm_comment_t *comment) { + VALUE argv[] = { PARSER_LOCATION(source, freeze, pm_comment_location(comment)) }; + VALUE type = (pm_comment_type(comment) == PM_COMMENT_EMBDOC) ? rb_cPrismEmbDocComment : rb_cPrismInlineComment; return rb_class_new_instance_freeze(1, argv, type, freeze); } +typedef struct { + VALUE comments; + VALUE source; + bool freeze; +} parser_comments_each_data_t; + +static void +parser_comments_each(const pm_comment_t *comment, void *data) { + parser_comments_each_data_t *each_data = (parser_comments_each_data_t *) data; + VALUE value = parser_comment(each_data->source, each_data->freeze, comment); + rb_ary_push(each_data->comments, value); +} + /** * Extract the comments out of the parser into an array. */ static VALUE parser_comments(const pm_parser_t *parser, VALUE source, bool freeze) { - VALUE comments = rb_ary_new_capa(parser->comment_list.size); - - for ( - const pm_comment_t *comment = (const pm_comment_t *) parser->comment_list.head; - comment != NULL; - comment = (const pm_comment_t *) comment->node.next - ) { - VALUE value = parser_comment(parser, source, freeze, comment); - rb_ary_push(comments, value); - } + VALUE comments = rb_ary_new_capa(pm_parser_comments_size(parser)); + + parser_comments_each_data_t each_data = { comments, source, freeze }; + pm_parser_comments_each(parser, parser_comments_each, &each_data); if (freeze) rb_obj_freeze(comments); return comments; @@ -494,28 +521,39 @@ parser_comments(const pm_parser_t *parser, VALUE source, bool freeze) { * Build a new MagicComment instance from the given parser and magic comment. */ static inline VALUE -parser_magic_comment(const pm_parser_t *parser, VALUE source, bool freeze, const pm_magic_comment_t *magic_comment) { - VALUE key_loc = parser_location(parser, source, freeze, magic_comment->key_start, magic_comment->key_length); - VALUE value_loc = parser_location(parser, source, freeze, magic_comment->value_start, magic_comment->value_length); +parser_magic_comment(VALUE source, bool freeze, const pm_magic_comment_t *magic_comment) { + pm_location_t key = pm_magic_comment_key(magic_comment); + pm_location_t value = pm_magic_comment_value(magic_comment); + + VALUE key_loc = parser_location(source, freeze, key.start, key.length); + VALUE value_loc = parser_location(source, freeze, value.start, value.length); + VALUE argv[] = { key_loc, value_loc }; return rb_class_new_instance_freeze(2, argv, rb_cPrismMagicComment, freeze); } +typedef struct { + VALUE magic_comments; + VALUE source; + bool freeze; +} parser_magic_comments_each_data_t; + +static void +parser_magic_comments_each(const pm_magic_comment_t *magic_comment, void *data) { + parser_magic_comments_each_data_t *each_data = (parser_magic_comments_each_data_t *) data; + VALUE value = parser_magic_comment(each_data->source, each_data->freeze, magic_comment); + rb_ary_push(each_data->magic_comments, value); +} + /** * Extract the magic comments out of the parser into an array. */ static VALUE parser_magic_comments(const pm_parser_t *parser, VALUE source, bool freeze) { - VALUE magic_comments = rb_ary_new_capa(parser->magic_comment_list.size); - - for ( - const pm_magic_comment_t *magic_comment = (const pm_magic_comment_t *) parser->magic_comment_list.head; - magic_comment != NULL; - magic_comment = (const pm_magic_comment_t *) magic_comment->node.next - ) { - VALUE value = parser_magic_comment(parser, source, freeze, magic_comment); - rb_ary_push(magic_comments, value); - } + VALUE magic_comments = rb_ary_new_capa(pm_parser_magic_comments_size(parser)); + + parser_magic_comments_each_data_t each_data = { magic_comments, source, freeze }; + pm_parser_magic_comments_each(parser, parser_magic_comments_each, &each_data); if (freeze) rb_obj_freeze(magic_comments); return magic_comments; @@ -527,85 +565,109 @@ parser_magic_comments(const pm_parser_t *parser, VALUE source, bool freeze) { */ static VALUE parser_data_loc(const pm_parser_t *parser, VALUE source, bool freeze) { - if (parser->data_loc.end == NULL) { + const pm_location_t *data_loc = pm_parser_data_loc(parser); + + if (data_loc->length == 0) { return Qnil; } else { - return PARSER_LOCATION_LOC(parser, source, freeze, parser->data_loc); + return parser_location(source, freeze, data_loc->start, data_loc->length); } } +typedef struct { + VALUE errors; + rb_encoding *encoding; + VALUE source; + bool freeze; +} parser_errors_each_data_t; + +static void +parser_errors_each(const pm_diagnostic_t *diagnostic, void *data) { + parser_errors_each_data_t *each_data = (parser_errors_each_data_t *) data; + + VALUE type = ID2SYM(rb_intern(pm_diagnostic_type(diagnostic))); + VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(pm_diagnostic_message(diagnostic), each_data->encoding)); + VALUE location = PARSER_LOCATION(each_data->source, each_data->freeze, pm_diagnostic_location(diagnostic)); + + pm_error_level_t error_level = pm_diagnostic_error_level(diagnostic); + VALUE level = Qnil; + + switch (error_level) { + case PM_ERROR_LEVEL_SYNTAX: + level = ID2SYM(rb_intern("syntax")); + break; + case PM_ERROR_LEVEL_ARGUMENT: + level = ID2SYM(rb_intern("argument")); + break; + case PM_ERROR_LEVEL_LOAD: + level = ID2SYM(rb_intern("load")); + break; + default: + rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, error_level); + } + + VALUE argv[] = { type, message, location, level }; + VALUE value = rb_class_new_instance_freeze(4, argv, rb_cPrismParseError, each_data->freeze); + rb_ary_push(each_data->errors, value); +} + /** * Extract the errors out of the parser into an array. */ static VALUE parser_errors(const pm_parser_t *parser, rb_encoding *encoding, VALUE source, bool freeze) { - VALUE errors = rb_ary_new_capa(parser->error_list.size); - - for ( - const pm_diagnostic_t *error = (const pm_diagnostic_t *) parser->error_list.head; - error != NULL; - error = (const pm_diagnostic_t *) error->node.next - ) { - VALUE type = ID2SYM(rb_intern(pm_diagnostic_id_human(error->diag_id))); - VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(error->message, encoding)); - VALUE location = PARSER_LOCATION_LOC(parser, source, freeze, error->location); - - VALUE level = Qnil; - switch (error->level) { - case PM_ERROR_LEVEL_SYNTAX: - level = ID2SYM(rb_intern("syntax")); - break; - case PM_ERROR_LEVEL_ARGUMENT: - level = ID2SYM(rb_intern("argument")); - break; - case PM_ERROR_LEVEL_LOAD: - level = ID2SYM(rb_intern("load")); - break; - default: - rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, error->level); - } + VALUE errors = rb_ary_new_capa(pm_parser_errors_size(parser)); - VALUE argv[] = { type, message, location, level }; - VALUE value = rb_class_new_instance_freeze(4, argv, rb_cPrismParseError, freeze); - rb_ary_push(errors, value); - } + parser_errors_each_data_t each_data = { errors, encoding, source, freeze }; + pm_parser_errors_each(parser, parser_errors_each, &each_data); if (freeze) rb_obj_freeze(errors); return errors; } +typedef struct { + VALUE warnings; + rb_encoding *encoding; + VALUE source; + bool freeze; +} parser_warnings_each_data_t; + +static void +parser_warnings_each(const pm_diagnostic_t *diagnostic, void *data) { + parser_warnings_each_data_t *each_data = (parser_warnings_each_data_t *) data; + + VALUE type = ID2SYM(rb_intern(pm_diagnostic_type(diagnostic))); + VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(pm_diagnostic_message(diagnostic), each_data->encoding)); + VALUE location = PARSER_LOCATION(each_data->source, each_data->freeze, pm_diagnostic_location(diagnostic)); + + pm_warning_level_t warning_level = pm_diagnostic_warning_level(diagnostic); + VALUE level = Qnil; + + switch (warning_level) { + case PM_WARNING_LEVEL_DEFAULT: + level = ID2SYM(rb_intern("default")); + break; + case PM_WARNING_LEVEL_VERBOSE: + level = ID2SYM(rb_intern("verbose")); + break; + default: + rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, warning_level); + } + + VALUE argv[] = { type, message, location, level }; + VALUE value = rb_class_new_instance_freeze(4, argv, rb_cPrismParseWarning, each_data->freeze); + rb_ary_push(each_data->warnings, value); +} + /** * Extract the warnings out of the parser into an array. */ static VALUE parser_warnings(const pm_parser_t *parser, rb_encoding *encoding, VALUE source, bool freeze) { - VALUE warnings = rb_ary_new_capa(parser->warning_list.size); - - for ( - const pm_diagnostic_t *warning = (const pm_diagnostic_t *) parser->warning_list.head; - warning != NULL; - warning = (const pm_diagnostic_t *) warning->node.next - ) { - VALUE type = ID2SYM(rb_intern(pm_diagnostic_id_human(warning->diag_id))); - VALUE message = rb_obj_freeze(rb_enc_str_new_cstr(warning->message, encoding)); - VALUE location = PARSER_LOCATION_LOC(parser, source, freeze, warning->location); - - VALUE level = Qnil; - switch (warning->level) { - case PM_WARNING_LEVEL_DEFAULT: - level = ID2SYM(rb_intern("default")); - break; - case PM_WARNING_LEVEL_VERBOSE: - level = ID2SYM(rb_intern("verbose")); - break; - default: - rb_raise(rb_eRuntimeError, "Unknown level: %" PRIu8, warning->level); - } + VALUE warnings = rb_ary_new_capa(pm_parser_warnings_size(parser)); - VALUE argv[] = { type, message, location, level }; - VALUE value = rb_class_new_instance_freeze(4, argv, rb_cPrismParseWarning, freeze); - rb_ary_push(warnings, value); - } + parser_warnings_each_data_t each_data = { warnings, encoding, source, freeze }; + pm_parser_warnings_each(parser, parser_warnings_each, &each_data); if (freeze) rb_obj_freeze(warnings); return warnings; @@ -623,10 +685,11 @@ parse_result_create(VALUE class, const pm_parser_t *parser, VALUE value, rb_enco parser_data_loc(parser, source, freeze), parser_errors(parser, encoding, source, freeze), parser_warnings(parser, encoding, source, freeze), + pm_parser_continuable(parser) ? Qtrue : Qfalse, source }; - return rb_class_new_instance_freeze(7, result_argv, class, freeze); + return rb_class_new_instance_freeze(8, result_argv, class, freeze); } /******************************************************************************/ @@ -651,11 +714,11 @@ typedef struct { * onto the tokens array. */ static void -parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) { - parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; +parse_lex_token(pm_parser_t *parser, pm_token_t *token, void *data) { + parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) data; VALUE value = pm_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source, parse_lex_data->freeze); - VALUE yields = rb_assoc_new(value, INT2FIX(parser->lex_state)); + VALUE yields = rb_assoc_new(value, INT2FIX(pm_parser_lex_state(parser))); if (parse_lex_data->freeze) { rb_obj_freeze(value); @@ -672,8 +735,8 @@ parse_lex_token(void *data, pm_parser_t *parser, pm_token_t *token) { */ static void parse_lex_encoding_changed_callback(pm_parser_t *parser) { - parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data; - parse_lex_data->encoding = rb_enc_find(parser->encoding->name); + parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) pm_parser_lex_callback_data(parser); + parse_lex_data->encoding = rb_enc_find(pm_parser_encoding_name(parser)); // Since the encoding changed, we need to go back and change the encoding of // the tokens that were already lexed. This is only going to end up being @@ -718,43 +781,38 @@ parse_lex_encoding_changed_callback(pm_parser_t *parser) { * the nodes and tokens. */ static VALUE -parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nodes) { - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); - pm_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback); +parse_lex_input(const uint8_t *input, size_t input_length, const pm_options_t *options, bool return_nodes) { + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); + pm_parser_encoding_changed_callback_set(parser, parse_lex_encoding_changed_callback); - VALUE source_string = rb_str_new((const char *) pm_string_source(input), pm_string_length(input)); - VALUE offsets = rb_ary_new_capa(parser.newline_list.size); - VALUE source = rb_funcall(rb_cPrismSource, rb_id_source_for, 3, source_string, LONG2NUM(parser.start_line), offsets); + VALUE source_string = rb_str_new((const char *) input, input_length); + VALUE offsets = rb_ary_new_capa(pm_parser_line_offsets(parser)->size); + VALUE source = rb_funcall(rb_cPrismSource, rb_id_source_for, 3, source_string, LONG2NUM(pm_parser_start_line(parser)), offsets); parse_lex_data_t parse_lex_data = { .source = source, .tokens = rb_ary_new(), - .encoding = rb_utf8_encoding(), - .freeze = options->freeze, + .encoding = rb_enc_find(pm_parser_encoding_name(parser)), + .freeze = pm_options_freeze(options), }; parse_lex_data_t *data = &parse_lex_data; - pm_lex_callback_t lex_callback = (pm_lex_callback_t) { - .data = (void *) data, - .callback = parse_lex_token, - }; + pm_parser_lex_callback_set(parser, parse_lex_token, data); - parser.lex_callback = &lex_callback; - pm_node_t *node = pm_parse(&parser); + pm_node_t *node = pm_parse(parser); - // Here we need to update the Source object to have the correct - // encoding for the source string and the correct newline offsets. - // We do it here because we've already created the Source object and given - // it over to all of the tokens, and both of these are only set after pm_parse(). - rb_encoding *encoding = rb_enc_find(parser.encoding->name); + /* Update the Source object with the correct encoding and line offsets, + * which are only available after pm_parse() completes. */ + rb_encoding *encoding = rb_enc_find(pm_parser_encoding_name(parser)); rb_enc_associate(source_string, encoding); - for (size_t index = 0; index < parser.newline_list.size; index++) { - rb_ary_push(offsets, ULONG2NUM(parser.newline_list.offsets[index])); + const pm_line_offset_list_t *line_offsets = pm_parser_line_offsets(parser); + for (size_t index = 0; index < line_offsets->size; index++) { + rb_ary_store(offsets, (long) index, ULONG2NUM(line_offsets->offsets[index])); } - if (options->freeze) { + if (pm_options_freeze(options)) { rb_obj_freeze(source_string); rb_obj_freeze(offsets); rb_obj_freeze(source); @@ -764,58 +822,57 @@ parse_lex_input(pm_string_t *input, const pm_options_t *options, bool return_nod VALUE result; if (return_nodes) { VALUE value = rb_ary_new_capa(2); - rb_ary_push(value, pm_ast_new(&parser, node, parse_lex_data.encoding, source, options->freeze)); + rb_ary_push(value, pm_ast_new(parser, node, parse_lex_data.encoding, source, pm_options_freeze(options))); rb_ary_push(value, parse_lex_data.tokens); - if (options->freeze) rb_obj_freeze(value); - result = parse_result_create(rb_cPrismParseLexResult, &parser, value, parse_lex_data.encoding, source, options->freeze); + if (pm_options_freeze(options)) rb_obj_freeze(value); + result = parse_result_create(rb_cPrismParseLexResult, parser, value, parse_lex_data.encoding, source, pm_options_freeze(options)); } else { - result = parse_result_create(rb_cPrismLexResult, &parser, parse_lex_data.tokens, parse_lex_data.encoding, source, options->freeze); + result = parse_result_create(rb_cPrismLexResult, parser, parse_lex_data.tokens, parse_lex_data.encoding, source, pm_options_freeze(options)); } - pm_node_destroy(&parser, node); - pm_parser_free(&parser); + pm_parser_free(parser); + pm_arena_free(arena); return result; } /** + * :markup: markdown * call-seq: - * Prism::lex(source, **options) -> LexResult + * lex(source, **options) -> LexResult * * Return a LexResult instance that contains an array of Token instances - * corresponding to the given string. For supported options, see Prism::parse. + * corresponding to the given string. For supported options, see Prism.parse. */ static VALUE lex(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); - VALUE result = parse_lex_input(&input, &options, false); - pm_string_free(&input); - pm_options_free(&options); + VALUE result = parse_lex_input((const uint8_t *) RSTRING_PTR(string), RSTRING_LEN(string), options, false); + pm_options_free(options); return result; } /** + * :markup: markdown * call-seq: - * Prism::lex_file(filepath, **options) -> LexResult + * lex_file(filepath, **options) -> LexResult * * Return a LexResult instance that contains an array of Token instances - * corresponding to the given file. For supported options, see Prism::parse. + * corresponding to the given file. For supported options, see Prism.parse. */ static VALUE lex_file(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE value = parse_lex_input(&input, &options, false); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = parse_lex_input(pm_source_source(src), pm_source_length(src), options, false); + pm_source_free(src); + pm_options_free(options); return value; } @@ -828,30 +885,32 @@ lex_file(int argc, VALUE *argv, VALUE self) { * Parse the given input and return a ParseResult instance. */ static VALUE -parse_input(pm_string_t *input, const pm_options_t *options) { - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); +parse_input(const uint8_t *input, size_t input_length, const pm_options_t *options) { + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); - pm_node_t *node = pm_parse(&parser); - rb_encoding *encoding = rb_enc_find(parser.encoding->name); + pm_node_t *node = pm_parse(parser); + rb_encoding *encoding = rb_enc_find(pm_parser_encoding_name(parser)); - VALUE source = pm_source_new(&parser, encoding, options->freeze); - VALUE value = pm_ast_new(&parser, node, encoding, source, options->freeze); - VALUE result = parse_result_create(rb_cPrismParseResult, &parser, value, encoding, source, options->freeze); + bool freeze = pm_options_freeze(options); + VALUE source = pm_source_new(parser, encoding, freeze); + VALUE value = pm_ast_new(parser, node, encoding, source, freeze); + VALUE result = parse_result_create(rb_cPrismParseResult, parser, value, encoding, source, freeze); - if (options->freeze) { + if (freeze) { rb_obj_freeze(source); } - pm_node_destroy(&parser, node); - pm_parser_free(&parser); + pm_parser_free(parser); + pm_arena_free(arena); return result; } /** + * :markup: markdown * call-seq: - * Prism::parse(source, **options) -> ParseResult + * parse(source, **options) -> ParseResult * * Parse the given string and return a ParseResult instance. The options that * are supported are: @@ -888,51 +947,57 @@ parse_input(pm_string_t *input, const pm_options_t *options) { * version of Ruby syntax (which you can trigger with `nil` or * `"latest"`). You may also restrict the syntax to a specific version of * Ruby, e.g., with `"3.3.0"`. To parse with the same syntax version that - * the current Ruby is running use `version: RUBY_VERSION`. Raises - * ArgumentError if the version is not currently supported by Prism. + * the current Ruby is running use `version: "current"`. To parse with the + * nearest version to the current Ruby that is running, use + * `version: "nearest"`. Raises ArgumentError if the version is not + * currently supported by Prism. */ static VALUE parse(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); + + const uint8_t *source = (const uint8_t *) RSTRING_PTR(string); + size_t length = RSTRING_LEN(string); #ifdef PRISM_BUILD_DEBUG - size_t length = pm_string_length(&input); char* dup = xmalloc(length); - memcpy(dup, pm_string_source(&input), length); - pm_string_constant_init(&input, dup, length); + memcpy(dup, source, length); + source = (const uint8_t *) dup; #endif - VALUE value = parse_input(&input, &options); + VALUE value = parse_input(source, length, options); #ifdef PRISM_BUILD_DEBUG +#ifdef xfree_sized + xfree_sized(dup, length); +#else xfree(dup); #endif +#endif - pm_string_free(&input); - pm_options_free(&options); + pm_options_free(options); return value; } /** + * :markup: markdown * call-seq: - * Prism::parse_file(filepath, **options) -> ParseResult + * parse_file(filepath, **options) -> ParseResult * * Parse the given file and return a ParseResult instance. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE parse_file(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE value = parse_input(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = parse_input(pm_source_source(src), pm_source_length(src), options); + pm_source_free(src); + pm_options_free(options); return value; } @@ -941,59 +1006,66 @@ parse_file(int argc, VALUE *argv, VALUE self) { * Parse the given input and return nothing. */ static void -profile_input(pm_string_t *input, const pm_options_t *options) { - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); +profile_input(const uint8_t *input, size_t input_length, const pm_options_t *options) { + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); - pm_node_t *node = pm_parse(&parser); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); + pm_parse(parser); + pm_parser_free(parser); + pm_arena_free(arena); } /** + * :markup: markdown * call-seq: - * Prism::profile(source, **options) -> nil + * profile(source, **options) -> nil * * Parse the given string and return nothing. This method is meant to allow * profilers to avoid the overhead of reifying the AST to Ruby. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE profile(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); - string_options(argc, argv, &input, &options); - profile_input(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + profile_input((const uint8_t *) RSTRING_PTR(string), RSTRING_LEN(string), options); + pm_options_free(options); return Qnil; } /** + * :markup: markdown * call-seq: - * Prism::profile_file(filepath, **options) -> nil + * profile_file(filepath, **options) -> nil * * Parse the given file and return nothing. This method is meant to allow * profilers to avoid the overhead of reifying the AST to Ruby. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE profile_file(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - profile_input(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + profile_input(pm_source_source(src), pm_source_length(src), options); + pm_source_free(src); + pm_options_free(options); return Qnil; } +static int +parse_stream_eof(void *stream) { + if (rb_funcall((VALUE) stream, rb_intern("eof?"), 0)) { + return 1; + } + return 0; +} + /** * An implementation of fgets that is suitable for use with Ruby IO objects. */ @@ -1016,11 +1088,12 @@ parse_stream_fgets(char *string, int size, void *stream) { } /** + * :markup: markdown * call-seq: - * Prism::parse_stream(stream, **options) -> ParseResult + * parse_stream(stream, **options) -> ParseResult * * Parse the given object that responds to `gets` and return a ParseResult - * instance. The options that are supported are the same as Prism::parse. + * instance. The options that are supported are the same as Prism.parse. */ static VALUE parse_stream(int argc, VALUE *argv, VALUE self) { @@ -1028,22 +1101,24 @@ parse_stream(int argc, VALUE *argv, VALUE self) { VALUE keywords; rb_scan_args(argc, argv, "1:", &stream, &keywords); - pm_options_t options = { 0 }; - extract_options(&options, Qnil, keywords); + pm_options_t *options = pm_options_new(); + extract_options(options, Qnil, keywords); - pm_parser_t parser; - pm_buffer_t buffer; + pm_source_t *src = pm_source_stream_new((void *) stream, parse_stream_fgets, parse_stream_eof); + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser; - pm_node_t *node = pm_parse_stream(&parser, &buffer, (void *) stream, parse_stream_fgets, &options); - rb_encoding *encoding = rb_enc_find(parser.encoding->name); + pm_node_t *node = pm_parse_stream(&parser, arena, src, options); + rb_encoding *encoding = rb_enc_find(pm_parser_encoding_name(parser)); - VALUE source = pm_source_new(&parser, encoding, options.freeze); - VALUE value = pm_ast_new(&parser, node, encoding, source, options.freeze); - VALUE result = parse_result_create(rb_cPrismParseResult, &parser, value, encoding, source, options.freeze); + VALUE source = pm_source_new(parser, encoding, pm_options_freeze(options)); + VALUE value = pm_ast_new(parser, node, encoding, source, pm_options_freeze(options)); + VALUE result = parse_result_create(rb_cPrismParseResult, parser, value, encoding, source, pm_options_freeze(options)); - pm_node_destroy(&parser, node); - pm_buffer_free(&buffer); - pm_parser_free(&parser); + pm_source_free(src); + pm_parser_free(parser); + pm_arena_free(arena); + pm_options_free(options); return result; } @@ -1052,116 +1127,114 @@ parse_stream(int argc, VALUE *argv, VALUE self) { * Parse the given input and return an array of Comment objects. */ static VALUE -parse_input_comments(pm_string_t *input, const pm_options_t *options) { - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); +parse_input_comments(const uint8_t *input, size_t input_length, const pm_options_t *options) { + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); - pm_node_t *node = pm_parse(&parser); - rb_encoding *encoding = rb_enc_find(parser.encoding->name); + pm_parse(parser); + rb_encoding *encoding = rb_enc_find(pm_parser_encoding_name(parser)); - VALUE source = pm_source_new(&parser, encoding, options->freeze); - VALUE comments = parser_comments(&parser, source, options->freeze); + VALUE source = pm_source_new(parser, encoding, pm_options_freeze(options)); + VALUE comments = parser_comments(parser, source, pm_options_freeze(options)); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); + pm_parser_free(parser); + pm_arena_free(arena); return comments; } /** + * :markup: markdown * call-seq: - * Prism::parse_comments(source, **options) -> Array + * parse_comments(source, **options) -> Array * * Parse the given string and return an array of Comment objects. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE parse_comments(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); - VALUE result = parse_input_comments(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE result = parse_input_comments((const uint8_t *) RSTRING_PTR(string), RSTRING_LEN(string), options); + pm_options_free(options); return result; } /** + * :markup: markdown * call-seq: - * Prism::parse_file_comments(filepath, **options) -> Array + * parse_file_comments(filepath, **options) -> Array * * Parse the given file and return an array of Comment objects. For supported - * options, see Prism::parse. + * options, see Prism.parse. */ static VALUE parse_file_comments(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE value = parse_input_comments(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = parse_input_comments(pm_source_source(src), pm_source_length(src), options); + pm_source_free(src); + pm_options_free(options); return value; } /** + * :markup: markdown * call-seq: - * Prism::parse_lex(source, **options) -> ParseLexResult + * parse_lex(source, **options) -> ParseLexResult * * Parse the given string and return a ParseLexResult instance that contains a * 2-element array, where the first element is the AST and the second element is * an array of Token instances. * * This API is only meant to be used in the case where you need both the AST and - * the tokens. If you only need one or the other, use either Prism::parse or - * Prism::lex. + * the tokens. If you only need one or the other, use either Prism.parse or + * Prism.lex. * - * For supported options, see Prism::parse. + * For supported options, see Prism.parse. */ static VALUE parse_lex(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); - VALUE value = parse_lex_input(&input, &options, true); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = parse_lex_input((const uint8_t *) RSTRING_PTR(string), RSTRING_LEN(string), options, true); + pm_options_free(options); return value; } /** + * :markup: markdown * call-seq: - * Prism::parse_lex_file(filepath, **options) -> ParseLexResult + * parse_lex_file(filepath, **options) -> ParseLexResult * * Parse the given file and return a ParseLexResult instance that contains a * 2-element array, where the first element is the AST and the second element is * an array of Token instances. * * This API is only meant to be used in the case where you need both the AST and - * the tokens. If you only need one or the other, use either Prism::parse_file - * or Prism::lex_file. + * the tokens. If you only need one or the other, use either Prism.parse_file + * or Prism.lex_file. * - * For supported options, see Prism::parse. + * For supported options, see Prism.parse. */ static VALUE parse_lex_file(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE value = parse_lex_input(&input, &options, true); - pm_string_free(&input); - pm_options_free(&options); + VALUE value = parse_lex_input(pm_source_source(src), pm_source_length(src), options, true); + pm_source_free(src); + pm_options_free(options); return value; } @@ -1170,45 +1243,45 @@ parse_lex_file(int argc, VALUE *argv, VALUE self) { * Parse the given input and return true if it parses without errors. */ static VALUE -parse_input_success_p(pm_string_t *input, const pm_options_t *options) { - pm_parser_t parser; - pm_parser_init(&parser, pm_string_source(input), pm_string_length(input), options); +parse_input_success_p(const uint8_t *input, size_t input_length, const pm_options_t *options) { + pm_arena_t *arena = pm_arena_new(); + pm_parser_t *parser = pm_parser_new(arena, input, input_length, options); - pm_node_t *node = pm_parse(&parser); - pm_node_destroy(&parser, node); + pm_parse(parser); - VALUE result = parser.error_list.size == 0 ? Qtrue : Qfalse; - pm_parser_free(&parser); + VALUE result = pm_parser_errors_size(parser) == 0 ? Qtrue : Qfalse; + pm_parser_free(parser); + pm_arena_free(arena); return result; } /** + * :markup: markdown * call-seq: - * Prism::parse_success?(source, **options) -> bool + * parse_success?(source, **options) -> bool * * Parse the given string and return true if it parses without errors. For - * supported options, see Prism::parse. + * supported options, see Prism.parse. */ static VALUE parse_success_p(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; - string_options(argc, argv, &input, &options); + pm_options_t *options = pm_options_new(); + VALUE string = string_options(argc, argv, options); - VALUE result = parse_input_success_p(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE result = parse_input_success_p((const uint8_t *) RSTRING_PTR(string), RSTRING_LEN(string), options); + pm_options_free(options); return result; } /** + * :markup: markdown * call-seq: - * Prism::parse_failure?(source, **options) -> bool + * parse_failure?(source, **options) -> bool * * Parse the given string and return true if it parses with errors. For - * supported options, see Prism::parse. + * supported options, see Prism.parse. */ static VALUE parse_failure_p(int argc, VALUE *argv, VALUE self) { @@ -1216,33 +1289,34 @@ parse_failure_p(int argc, VALUE *argv, VALUE self) { } /** + * :markup: markdown * call-seq: - * Prism::parse_file_success?(filepath, **options) -> bool + * parse_file_success?(filepath, **options) -> bool * * Parse the given file and return true if it parses without errors. For - * supported options, see Prism::parse. + * supported options, see Prism.parse. */ static VALUE parse_file_success_p(int argc, VALUE *argv, VALUE self) { - pm_string_t input; - pm_options_t options = { 0 }; + pm_options_t *options = pm_options_new(); VALUE encoded_filepath; - file_options(argc, argv, &input, &options, &encoded_filepath); + pm_source_t *src = file_options(argc, argv, options, &encoded_filepath); - VALUE result = parse_input_success_p(&input, &options); - pm_string_free(&input); - pm_options_free(&options); + VALUE result = parse_input_success_p(pm_source_source(src), pm_source_length(src), options); + pm_source_free(src); + pm_options_free(options); return result; } /** + * :markup: markdown * call-seq: - * Prism::parse_file_failure?(filepath, **options) -> bool + * parse_file_failure?(filepath, **options) -> bool * * Parse the given file and return true if it parses with errors. For - * supported options, see Prism::parse. + * supported options, see Prism.parse. */ static VALUE parse_file_failure_p(int argc, VALUE *argv, VALUE self) { @@ -1272,8 +1346,9 @@ string_query(pm_string_query_t result) { } /** + * :markup: markdown * call-seq: - * Prism::StringQuery::local?(string) -> bool + * local?(string) -> bool * * Returns true if the string constitutes a valid local variable name. Note that * this means the names that can be set through Binding#local_variable_set, not @@ -1286,8 +1361,9 @@ string_query_local_p(VALUE self, VALUE string) { } /** + * :markup: markdown * call-seq: - * Prism::StringQuery::constant?(string) -> bool + * constant?(string) -> bool * * Returns true if the string constitutes a valid constant name. Note that this * means the names that can be set through Module#const_set, not necessarily the @@ -1300,8 +1376,9 @@ string_query_constant_p(VALUE self, VALUE string) { } /** + * :markup: markdown * call-seq: - * Prism::StringQuery::method_name?(string) -> bool + * method_name?(string) -> bool * * Returns true if the string constitutes a valid method name. */ @@ -1356,6 +1433,8 @@ Init_prism(void) { rb_cPrismStringQuery = rb_define_class_under(rb_cPrism, "StringQuery", rb_cObject); rb_cPrismScope = rb_define_class_under(rb_cPrism, "Scope", rb_cObject); + rb_cPrismCurrentVersionError = rb_const_get(rb_cPrism, rb_intern("CurrentVersionError")); + // Intern all of the IDs eagerly that we support so that we don't have to do // it every time we parse. rb_id_option_command_line = rb_intern_const("command_line"); @@ -1407,5 +1486,4 @@ Init_prism(void) { // Next, initialize the other APIs. Init_prism_api_node(); - Init_prism_pack(); } diff --git a/prism/extension.h b/prism/extension.h index 506da2fd6f..d0cbc2ff53 100644 --- a/prism/extension.h +++ b/prism/extension.h @@ -1,10 +1,11 @@ #ifndef PRISM_EXT_NODE_H #define PRISM_EXT_NODE_H -#define EXPECTED_PRISM_VERSION "1.4.0" +#define EXPECTED_PRISM_VERSION "1.9.0" #include <ruby.h> #include <ruby/encoding.h> +#include <ruby/version.h> #include "prism.h" VALUE pm_source_new(const pm_parser_t *parser, rb_encoding *encoding, bool freeze); @@ -13,7 +14,6 @@ VALUE pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding * VALUE pm_integer_new(const pm_integer_t *integer); void Init_prism_api_node(void); -void Init_prism_pack(void); RUBY_FUNC_EXPORTED void Init_prism(void); #endif diff --git a/prism/util/pm_integer.c b/prism/integer.c index 4170ecc58d..1b69dbdceb 100644 --- a/prism/util/pm_integer.c +++ b/prism/integer.c @@ -1,4 +1,25 @@ -#include "prism/util/pm_integer.h" +#include "prism/internal/integer.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/buffer.h" + +#include <assert.h> +#include <inttypes.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> + +/** + * Free the internal memory of an integer. This memory will only be allocated if + * the integer exceeds the size of a single uint32_t. + */ +static void +pm_integer_free(pm_integer_t *integer) { + if (integer->values) { + xfree(integer->values); + } +} /** * Pull out the length and values from the integer, regardless of the form in @@ -374,7 +395,7 @@ pm_integer_convert_base(pm_integer_t *destination, const pm_integer_t *source, u } } - xfree(bigints); + xfree_sized(bigints, bigints_length * sizeof(pm_integer_t)); bigints = next_bigints; bigints_length = next_length; } @@ -383,7 +404,7 @@ pm_integer_convert_base(pm_integer_t *destination, const pm_integer_t *source, u destination->negative = source->negative; pm_integer_normalize(destination); - xfree(bigints); + xfree_sized(bigints, bigints_length * sizeof(pm_integer_t)); pm_integer_free(&base); } @@ -422,7 +443,7 @@ pm_integer_parse_powof2(pm_integer_t *integer, uint32_t base, const uint8_t *dig static void pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t digits_length) { const size_t batch = 9; - size_t length = (digits_length + batch - 1) / batch; + const size_t length = (digits_length + batch - 1) / batch; uint32_t *values = (uint32_t *) xcalloc(length, sizeof(uint32_t)); uint32_t value = 0; @@ -439,7 +460,7 @@ pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t di // Convert base from 10**9 to 1<<32. pm_integer_convert_base(integer, &((pm_integer_t) { .length = length, .values = values, .value = 0, .negative = false }), 1000000000, ((uint64_t) 1 << 32)); - xfree(values); + xfree_sized(values, length * sizeof(uint32_t)); } /** @@ -448,7 +469,8 @@ pm_integer_parse_decimal(pm_integer_t *integer, const uint8_t *digits, size_t di static void pm_integer_parse_big(pm_integer_t *integer, uint32_t multiplier, const uint8_t *start, const uint8_t *end) { // Allocate an array to store digits. - uint8_t *digits = xmalloc(sizeof(uint8_t) * (size_t) (end - start)); + const size_t digits_capa = sizeof(uint8_t) * (size_t) (end - start); + uint8_t *digits = xmalloc(digits_capa); size_t digits_length = 0; for (; start < end; start++) { @@ -463,7 +485,7 @@ pm_integer_parse_big(pm_integer_t *integer, uint32_t multiplier, const uint8_t * pm_integer_parse_powof2(integer, multiplier, digits, digits_length); } - xfree(digits); + xfree_sized(digits, digits_capa); } /** @@ -603,7 +625,7 @@ void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator) { /** * Convert an integer to a decimal string. */ -PRISM_EXPORTED_FUNCTION void +void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) { if (integer->negative) { pm_buffer_append_byte(buffer, '-'); @@ -635,7 +657,7 @@ pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) { } // Allocate a buffer that we'll copy the decimal digits into. - size_t digits_length = converted.length * 9; + const size_t digits_length = converted.length * 9; char *digits = xcalloc(digits_length, sizeof(char)); if (digits == NULL) return; @@ -654,17 +676,6 @@ pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer) { // Finally, append the string to the buffer and free the digits. pm_buffer_append_string(buffer, digits + start_offset, digits_length - start_offset); - xfree(digits); + xfree_sized(digits, sizeof(char) * digits_length); pm_integer_free(&converted); } - -/** - * Free the internal memory of an integer. This memory will only be allocated if - * the integer exceeds the size of a single uint32_t. - */ -PRISM_EXPORTED_FUNCTION void -pm_integer_free(pm_integer_t *integer) { - if (integer->values) { - xfree(integer->values); - } -} diff --git a/prism/integer.h b/prism/integer.h new file mode 100644 index 0000000000..9285986885 --- /dev/null +++ b/prism/integer.h @@ -0,0 +1,41 @@ +/** + * @file integer.h + * + * This module provides functions for working with arbitrary-sized integers. + */ +#ifndef PRISM_INTEGER_H +#define PRISM_INTEGER_H + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/** + * A structure represents an arbitrary-sized integer. + */ +typedef struct { + /** + * The number of allocated values. length is set to 0 if the integer fits + * into uint32_t. + */ + size_t length; + + /** + * List of 32-bit integers. Set to NULL if the integer fits into uint32_t. + */ + uint32_t *values; + + /** + * Embedded value for small integer. This value is set to 0 if the value + * does not fit into uint32_t. + */ + uint32_t value; + + /** + * Whether or not the integer is negative. It is stored this way so that a + * zeroed pm_integer_t is always positive zero. + */ + bool negative; +} pm_integer_t; + +#endif diff --git a/prism/internal/allocator.h b/prism/internal/allocator.h new file mode 100644 index 0000000000..6c54010dbf --- /dev/null +++ b/prism/internal/allocator.h @@ -0,0 +1,68 @@ +#ifndef PRISM_INTERNAL_ALLOCATOR_H +#define PRISM_INTERNAL_ALLOCATOR_H + +/* If you build Prism with a custom allocator, configure it with + * "-D PRISM_XALLOCATOR" to use your own allocator that defines xmalloc, + * xrealloc, xcalloc, and xfree. + * + * For example, your `prism_xallocator.h` file could look like this: + * + * ``` + * #ifndef PRISM_XALLOCATOR_H + * #define PRISM_XALLOCATOR_H + * #define xmalloc my_malloc + * #define xrealloc my_realloc + * #define xcalloc my_calloc + * #define xfree my_free + * #define xrealloc_sized my_realloc_sized // (optional) + * #define xfree_sized my_free_sized // (optional) + * #endif + * ``` + */ +#ifdef PRISM_XALLOCATOR + #include "prism_xallocator.h" +#else + #ifndef xmalloc + /* The malloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xmalloc malloc + #endif + + #ifndef xrealloc + /* The realloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xrealloc realloc + #endif + + #ifndef xcalloc + /* The calloc function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xcalloc calloc + #endif + + #ifndef xfree + /* The free function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. */ + #define xfree free + #endif +#endif + +#ifndef xfree_sized + /* The free_sized function that should be used. This can be overridden with + * the PRISM_XALLOCATOR define. If not defined, defaults to calling xfree. + */ + #define xfree_sized(p, s) xfree(((void)(s), (p))) +#endif + +#ifndef xrealloc_sized + /* The xrealloc_sized function that should be used. This can be overridden + * with the PRISM_XALLOCATOR define. If not defined, defaults to calling + * xrealloc. */ + #define xrealloc_sized(p, ns, os) xrealloc((p), ((void)(os), (ns))) +#endif + +#ifdef PRISM_BUILD_DEBUG + #include "prism/internal/allocator_debug.h" +#endif + +#endif diff --git a/prism/internal/allocator_debug.h b/prism/internal/allocator_debug.h new file mode 100644 index 0000000000..846e96ba2d --- /dev/null +++ b/prism/internal/allocator_debug.h @@ -0,0 +1,88 @@ +#ifndef PRISM_INTERNAL_ALLOCATOR_DEBUG_H +#define PRISM_INTERNAL_ALLOCATOR_DEBUG_H + +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +static inline void * +pm_allocator_debug_malloc(size_t size) { + size_t *memory = xmalloc(size + sizeof(size_t)); + memory[0] = size; + return memory + 1; +} + +static inline void * +pm_allocator_debug_calloc(size_t nmemb, size_t size) { + size_t total_size = nmemb * size; + void *ptr = pm_allocator_debug_malloc(total_size); + memset(ptr, 0, total_size); + return ptr; +} + +static inline void * +pm_allocator_debug_realloc(void *ptr, size_t size) { + if (ptr == NULL) { + return pm_allocator_debug_malloc(size); + } + + size_t *memory = (size_t *)ptr; + void *raw_memory = memory - 1; + memory = (size_t *)xrealloc(raw_memory, size + sizeof(size_t)); + memory[0] = size; + return memory + 1; +} + +static inline void +pm_allocator_debug_free(void *ptr) { + if (ptr != NULL) { + size_t *memory = (size_t *)ptr; + xfree(memory - 1); + } +} + +static inline void +pm_allocator_debug_free_sized(void *ptr, size_t old_size) { + if (ptr != NULL) { + size_t *memory = (size_t *)ptr; + if (old_size != memory[-1]) { + fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but freed with size %lu\n", ptr, memory[-1], old_size); + abort(); + } + xfree_sized(memory - 1, old_size + sizeof(size_t)); + } +} + +static inline void * +pm_allocator_debug_realloc_sized(void *ptr, size_t size, size_t old_size) { + if (ptr == NULL) { + if (old_size != 0) { + fprintf(stderr, "[BUG] realloc_sized called with NULL pointer and old size %lu\n", old_size); + abort(); + } + return pm_allocator_debug_malloc(size); + } + + size_t *memory = (size_t *)ptr; + if (old_size != memory[-1]) { + fprintf(stderr, "[BUG] buffer %p was allocated with size %lu but realloced with size %lu\n", ptr, memory[-1], old_size); + abort(); + } + return pm_allocator_debug_realloc(ptr, size); +} + +#undef xmalloc +#undef xrealloc +#undef xcalloc +#undef xfree +#undef xrealloc_sized +#undef xfree_sized + +#define xmalloc pm_allocator_debug_malloc +#define xrealloc pm_allocator_debug_realloc +#define xcalloc pm_allocator_debug_calloc +#define xfree pm_allocator_debug_free +#define xrealloc_sized pm_allocator_debug_realloc_sized +#define xfree_sized pm_allocator_debug_free_sized + +#endif diff --git a/prism/internal/arena.h b/prism/internal/arena.h new file mode 100644 index 0000000000..2e413b42bf --- /dev/null +++ b/prism/internal/arena.h @@ -0,0 +1,108 @@ +#ifndef PRISM_INTERNAL_ARENA_H +#define PRISM_INTERNAL_ARENA_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/flex_array.h" +#include "prism/compiler/force_inline.h" +#include "prism/compiler/inline.h" + +#include "prism/arena.h" + +#include <stddef.h> +#include <string.h> + +/* + * A single block of memory in the arena. Blocks are linked via prev pointers so + * they can be freed by walking the chain. + */ +typedef struct pm_arena_block { + /* The previous block in the chain (for freeing). */ + struct pm_arena_block *prev; + + /* The total usable bytes in data[]. */ + size_t capacity; + + /* The number of bytes consumed so far. */ + size_t used; + + /* The block's data. */ + char data[PM_FLEX_ARRAY_LENGTH]; +} pm_arena_block_t; + +/* + * A bump allocator. Allocations are made by bumping a pointer within the + * current block. When a block is full, a new block is allocated and linked to + * the previous one. All blocks are freed at once by walking the chain. + */ +struct pm_arena_t { + /* The active block (allocate from here). */ + pm_arena_block_t *current; + + /* The number of blocks allocated. */ + size_t block_count; +}; + +/* + * Free all blocks in the arena. After this call, all pointers returned by + * pm_arena_alloc and pm_arena_zalloc are invalid. + */ +void pm_arena_cleanup(pm_arena_t *arena); + +/* + * Ensure the arena has at least `capacity` bytes available in its current + * block, allocating a new block if necessary. This allows callers to + * pre-size the arena to avoid repeated small block allocations. + */ +void pm_arena_reserve(pm_arena_t *arena, size_t capacity); + +/* + * Slow path for pm_arena_alloc: allocate a new block and return a pointer to + * the first `size` bytes. Do not call directly — use pm_arena_alloc instead. + */ +void * pm_arena_alloc_slow(pm_arena_t *arena, size_t size); + +/* + * Allocate memory from the arena. The returned memory is NOT zeroed. This + * function is infallible — it aborts on allocation failure. + * + * The fast path (bump pointer within the current block) is inlined at each + * call site. The slow path (new block allocation) is out-of-line. + */ +static PRISM_FORCE_INLINE void * +pm_arena_alloc(pm_arena_t *arena, size_t size, size_t alignment) { + if (arena->current != NULL) { + size_t used_aligned = (arena->current->used + alignment - 1) & ~(alignment - 1); + size_t needed = used_aligned + size; + + if (used_aligned >= arena->current->used && needed >= used_aligned && needed <= arena->current->capacity) { + arena->current->used = needed; + return arena->current->data + used_aligned; + } + } + + return pm_arena_alloc_slow(arena, size); +} + +/* + * Allocate zero-initialized memory from the arena. This function is infallible + * — it aborts on allocation failure. + */ +static PRISM_INLINE void * +pm_arena_zalloc(pm_arena_t *arena, size_t size, size_t alignment) { + void *ptr = pm_arena_alloc(arena, size, alignment); + memset(ptr, 0, size); + return ptr; +} + +/* + * Allocate memory from the arena and copy the given data into it. This is a + * convenience wrapper around pm_arena_alloc + memcpy. + */ +static PRISM_INLINE void * +pm_arena_memdup(pm_arena_t *arena, const void *src, size_t size, size_t alignment) { + void *dst = pm_arena_alloc(arena, size, alignment); + memcpy(dst, src, size); + return dst; +} + +#endif diff --git a/prism/internal/bit.h b/prism/internal/bit.h new file mode 100644 index 0000000000..b0111a4c2c --- /dev/null +++ b/prism/internal/bit.h @@ -0,0 +1,42 @@ +#ifndef PRISM_INTERNAL_BIT_H +#define PRISM_INTERNAL_BIT_H + +#include "prism/compiler/inline.h" + +/* + * Count trailing zero bits in a 64-bit value. Used by SWAR identifier scanning + * to find the first non-matching byte in a word. + * + * Precondition: v must be nonzero. The result is undefined when v == 0 + * (matching the behavior of __builtin_ctzll and _BitScanForward64). + */ +#if defined(__GNUC__) || defined(__clang__) +#define pm_ctzll(v) ((unsigned) __builtin_ctzll(v)) +#elif defined(_MSC_VER) +#include <intrin.h> +#include <stdint.h> + +static PRISM_INLINE unsigned +pm_ctzll(uint64_t v) { + unsigned long index; + _BitScanForward64(&index, v); + return (unsigned) index; +} +#else +#include <stdint.h> + +static PRISM_INLINE unsigned +pm_ctzll(uint64_t v) { + unsigned c = 0; + v &= (uint64_t) (-(int64_t) v); + if (v & 0x00000000FFFFFFFFULL) c += 0; else c += 32; + if (v & 0x0000FFFF0000FFFFULL) c += 0; else c += 16; + if (v & 0x00FF00FF00FF00FFULL) c += 0; else c += 8; + if (v & 0x0F0F0F0F0F0F0F0FULL) c += 0; else c += 4; + if (v & 0x3333333333333333ULL) c += 0; else c += 2; + if (v & 0x5555555555555555ULL) c += 0; else c += 1; + return c; +} +#endif + +#endif diff --git a/prism/internal/buffer.h b/prism/internal/buffer.h new file mode 100644 index 0000000000..a849bbf8e6 --- /dev/null +++ b/prism/internal/buffer.h @@ -0,0 +1,91 @@ +#ifndef PRISM_INTERNAL_BUFFER_H +#define PRISM_INTERNAL_BUFFER_H + +#include "prism/compiler/format.h" + +#include "prism/buffer.h" + +#include <stdbool.h> +#include <stdint.h> + +/* + * A simple memory buffer that stores data in a contiguous block of memory. + */ +struct pm_buffer_t { + /* The length of the buffer in bytes. */ + size_t length; + + /* The capacity of the buffer in bytes that has been allocated. */ + size_t capacity; + + /* A pointer to the start of the buffer. */ + char *value; +}; + +/* Initialize a pm_buffer_t with the given capacity. */ +void pm_buffer_init(pm_buffer_t *buffer, size_t capacity); + +/* Free the memory held by the buffer. */ +void pm_buffer_cleanup(pm_buffer_t *buffer); + +/* Append the given amount of space as zeroes to the buffer. */ +void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length); + +/* Append a formatted string to the buffer. */ +void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3); + +/* Append a string to the buffer. */ +void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length); + +/* Append a list of bytes to the buffer. */ +void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length); + +/* Append a single byte to the buffer. */ +void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value); + +/* Append a 32-bit unsigned integer to the buffer as a variable-length integer. */ +void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value); + +/* Append a 32-bit signed integer to the buffer as a variable-length integer. */ +void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); + +/* Append a double to the buffer. */ +void pm_buffer_append_double(pm_buffer_t *buffer, double value); + +/* Append a unicode codepoint to the buffer. */ +bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value); + +/* + * The different types of escaping that can be performed by the buffer when + * appending a slice of Ruby source code. + */ +typedef enum { + PM_BUFFER_ESCAPING_RUBY, + PM_BUFFER_ESCAPING_JSON +} pm_buffer_escaping_t; + +/* Append a slice of source code to the buffer. */ +void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping); + +/* Prepend the given string to the buffer. */ +void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length); + +/* Concatenate one buffer onto another. */ +void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source); + +/* + * Clear the buffer by reducing its size to 0. This does not free the allocated + * memory, but it does allow the buffer to be reused. + */ +void pm_buffer_clear(pm_buffer_t *buffer); + +/* Strip the whitespace from the end of the buffer. */ +void pm_buffer_rstrip(pm_buffer_t *buffer); + +/* Checks if the buffer includes the given value. */ +size_t pm_buffer_index(const pm_buffer_t *buffer, char value); + +/* Insert the given string into the buffer at the given index. */ +void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length); + +#endif diff --git a/prism/internal/char.h b/prism/internal/char.h new file mode 100644 index 0000000000..9a58fba8c5 --- /dev/null +++ b/prism/internal/char.h @@ -0,0 +1,139 @@ +#ifndef PRISM_INTERNAL_CHAR_H +#define PRISM_INTERNAL_CHAR_H + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" +#include "prism/line_offset_list.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/* Bit flag for whitespace characters in pm_byte_table. */ +#define PRISM_CHAR_BIT_WHITESPACE (1 << 0) + +/* Bit flag for inline whitespace characters in pm_byte_table. */ +#define PRISM_CHAR_BIT_INLINE_WHITESPACE (1 << 1) + +/* + * A lookup table for classifying bytes. Each entry is a bitfield of + * PRISM_CHAR_BIT_* flags. Defined in char.c. + */ +extern const uint8_t pm_byte_table[256]; + +/* Returns true if the given character is a whitespace character. */ +static PRISM_FORCE_INLINE bool +pm_char_is_whitespace(const uint8_t b) { + return (pm_byte_table[b] & PRISM_CHAR_BIT_WHITESPACE) != 0; +} + +/* Returns true if the given character is an inline whitespace character. */ +static PRISM_FORCE_INLINE bool +pm_char_is_inline_whitespace(const uint8_t b) { + return (pm_byte_table[b] & PRISM_CHAR_BIT_INLINE_WHITESPACE) != 0; +} + +/* + * Returns the number of characters at the start of the string that are inline + * whitespace (space/tab). Scans the byte table directly for use in hot paths. + */ +static PRISM_FORCE_INLINE size_t +pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length) { + if (length <= 0) return 0; + size_t size = 0; + size_t maximum = (size_t) length; + while (size < maximum && (pm_byte_table[string[size]] & PRISM_CHAR_BIT_INLINE_WHITESPACE)) size++; + return size; +} + +/* + * Returns the number of characters at the start of the string that are + * whitespace. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are + * whitespace while also tracking the location of each newline. Disallows + * searching past the given maximum number of characters. + */ +size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_arena_t *arena, pm_line_offset_list_t *line_offsets, uint32_t start_offset); + +/* + * Returns the number of characters at the start of the string that are decimal + * digits. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are + * hexadecimal digits. Disallows searching past the given maximum number of + * characters. + */ +size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are octal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are decimal + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are + * hexadecimal digits or underscores. Disallows searching past the given maximum + * number of characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + +/* + * Returns the number of characters at the start of the string that are regexp + * options. Disallows searching past the given maximum number of characters. + */ +size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length); + +/* + * Returns the number of characters at the start of the string that are binary + * digits or underscores. Disallows searching past the given maximum number of + * characters. + * + * If multiple underscores are found in a row or if an underscore is + * found at the end of the number, then the invalid pointer is set to the index + * of the first invalid underscore. + */ +size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); + + +/* Returns true if the given character is a binary digit. */ +bool pm_char_is_binary_digit(const uint8_t b); + +/* Returns true if the given character is an octal digit. */ +bool pm_char_is_octal_digit(const uint8_t b); + +/* Returns true if the given character is a decimal digit. */ +bool pm_char_is_decimal_digit(const uint8_t b); + +/* Returns true if the given character is a hexadecimal digit. */ +bool pm_char_is_hexadecimal_digit(const uint8_t b); + +#endif diff --git a/prism/internal/comments.h b/prism/internal/comments.h new file mode 100644 index 0000000000..bb3039a658 --- /dev/null +++ b/prism/internal/comments.h @@ -0,0 +1,20 @@ +#ifndef PRISM_INTERNAL_COMMENTS_H +#define PRISM_INTERNAL_COMMENTS_H + +#include "prism/comments.h" + +#include "prism/internal/list.h" + +/* A comment found while parsing. */ +struct pm_comment_t { + /* The embedded base node. */ + pm_list_node_t node; + + /* The location of the comment in the source. */ + pm_location_t location; + + /* The type of the comment. */ + pm_comment_type_t type; +}; + +#endif diff --git a/prism/internal/constant_pool.h b/prism/internal/constant_pool.h new file mode 100644 index 0000000000..fa2be783f5 --- /dev/null +++ b/prism/internal/constant_pool.h @@ -0,0 +1,117 @@ +#ifndef PRISM_INTERNAL_CONSTANT_POOL_H +#define PRISM_INTERNAL_CONSTANT_POOL_H + +#include "prism/constant_pool.h" + +#include "prism/arena.h" + +#include <stdbool.h> + +/* A constant in the pool which effectively stores a string. */ +struct pm_constant_t { + /* A pointer to the start of the string. */ + const uint8_t *start; + + /* The length of the string. */ + size_t length; +}; + +/* + * The type of bucket in the constant pool hash map. This determines how the + * bucket should be freed. + */ +typedef unsigned int pm_constant_pool_bucket_type_t; + +/* By default, each constant is a slice of the source. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0; + +/* An owned constant is one for which memory has been allocated. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1; + +/* A constant constant is known at compile time. */ +static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2; + +/* A bucket in the hash map. */ +typedef struct { + /* The incremental ID used for indexing back into the pool. */ + unsigned int id: 30; + + /* The type of the bucket, which determines how to free it. */ + pm_constant_pool_bucket_type_t type: 2; + + /* The hash of the bucket. */ + uint32_t hash; + + /* + * A pointer to the start of the string, stored directly in the bucket to + * avoid a pointer chase to the constants array during probing. + */ + const uint8_t *start; + + /* The length of the string. */ + size_t length; +} pm_constant_pool_bucket_t; + +/* The overall constant pool, which stores constants found while parsing. */ +struct pm_constant_pool_t { + /* The buckets in the hash map. */ + pm_constant_pool_bucket_t *buckets; + + /* The constants that are stored in the buckets. */ + pm_constant_t *constants; + + /* The number of buckets in the hash map. */ + uint32_t size; + + /* The number of buckets that have been allocated in the hash map. */ + uint32_t capacity; +}; + +/* + * When we allocate constants into the pool, we reserve 0 to mean that the slot + * is not yet filled. This constant is reused in other places to indicate the + * lack of a constant id. + */ +#define PM_CONSTANT_ID_UNSET 0 + +/* Initialize a list of constant ids with a given capacity. */ +void pm_constant_id_list_init_capacity(pm_arena_t *arena, pm_constant_id_list_t *list, size_t capacity); + +/* Insert a constant id into a list of constant ids at the specified index. */ +void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id); + +/* Checks if the current constant id list includes the given constant id. */ +bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id); + +/* Initialize a new constant pool with a given capacity. */ +void pm_constant_pool_init(pm_arena_t *arena, pm_constant_pool_t *pool, uint32_t capacity); + +/* Return a pointer to the constant indicated by the given constant id. */ +pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id); + +/* + * Find a constant in a constant pool. Returns the id of the constant, or 0 if + * the constant is not found. + */ +pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool that is a slice of a source string. + * Returns the id of the constant, or 0 if any potential calls to resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_shared(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool from memory that is now owned by the + * constant pool. Returns the id of the constant, or 0 if any potential calls to + * resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_owned(pm_arena_t *arena, pm_constant_pool_t *pool, uint8_t *start, size_t length); + +/* + * Insert a constant into a constant pool from memory that is constant. Returns + * the id of the constant, or 0 if any potential calls to resize fail. + */ +pm_constant_id_t pm_constant_pool_insert_constant(pm_arena_t *arena, pm_constant_pool_t *pool, const uint8_t *start, size_t length); + +#endif diff --git a/prism/encoding.h b/prism/internal/encoding.h index 5f7724821f..62392ef970 100644 --- a/prism/encoding.h +++ b/prism/internal/encoding.h @@ -1,128 +1,95 @@ -/** - * @file encoding.h - * - * The encoding interface and implementations used by the parser. - */ -#ifndef PRISM_ENCODING_H -#define PRISM_ENCODING_H - -#include "prism/defines.h" -#include "prism/util/pm_strncasecmp.h" +#ifndef PRISM_INTERNAL_ENCODING_H +#define PRISM_INTERNAL_ENCODING_H -#include <assert.h> #include <stdbool.h> #include <stddef.h> #include <stdint.h> -/** +/* * This struct defines the functions necessary to implement the encoding * interface so we can determine how many bytes the subsequent character takes. * Each callback should return the number of bytes, or 0 if the next bytes are * invalid for the encoding and type. */ typedef struct { - /** + /* * Return the number of bytes that the next character takes if it is valid * in the encoding. Does not read more than n bytes. It is assumed that n is * at least 1. */ size_t (*char_width)(const uint8_t *b, ptrdiff_t n); - /** + /* * Return the number of bytes that the next character takes if it is valid * in the encoding and is alphabetical. Does not read more than n bytes. It * is assumed that n is at least 1. */ size_t (*alpha_char)(const uint8_t *b, ptrdiff_t n); - /** + /* * Return the number of bytes that the next character takes if it is valid * in the encoding and is alphanumeric. Does not read more than n bytes. It * is assumed that n is at least 1. */ size_t (*alnum_char)(const uint8_t *b, ptrdiff_t n); - /** + /* * Return true if the next character is valid in the encoding and is an * uppercase character. Does not read more than n bytes. It is assumed that * n is at least 1. */ bool (*isupper_char)(const uint8_t *b, ptrdiff_t n); - /** + /* * The name of the encoding. This should correspond to a value that can be * passed to Encoding.find in Ruby. */ const char *name; - /** - * Return true if the encoding is a multibyte encoding. - */ + /* Return true if the encoding is a multibyte encoding. */ bool multibyte; } pm_encoding_t; -/** +/* * All of the lookup tables use the first bit of each embedded byte to indicate * whether the codepoint is alphabetical. */ #define PRISM_ENCODING_ALPHABETIC_BIT 1 << 0 -/** +/* * All of the lookup tables use the second bit of each embedded byte to indicate * whether the codepoint is alphanumeric. */ #define PRISM_ENCODING_ALPHANUMERIC_BIT 1 << 1 -/** +/* * All of the lookup tables use the third bit of each embedded byte to indicate * whether the codepoint is uppercase. */ #define PRISM_ENCODING_UPPERCASE_BIT 1 << 2 -/** - * Return the size of the next character in the UTF-8 encoding. - * - * @param b The bytes to read. - * @param n The number of bytes that can be read. - * @returns The number of bytes that the next character takes if it is valid in - * the encoding, or 0 if it is not. - */ +/* Return the size of the next character in the UTF-8 encoding. */ size_t pm_encoding_utf_8_char_width(const uint8_t *b, ptrdiff_t n); -/** +/* * Return the size of the next character in the UTF-8 encoding if it is an * alphabetical character. - * - * @param b The bytes to read. - * @param n The number of bytes that can be read. - * @returns The number of bytes that the next character takes if it is valid in - * the encoding, or 0 if it is not. */ size_t pm_encoding_utf_8_alpha_char(const uint8_t *b, ptrdiff_t n); -/** +/* * Return the size of the next character in the UTF-8 encoding if it is an * alphanumeric character. - * - * @param b The bytes to read. - * @param n The number of bytes that can be read. - * @returns The number of bytes that the next character takes if it is valid in - * the encoding, or 0 if it is not. */ size_t pm_encoding_utf_8_alnum_char(const uint8_t *b, ptrdiff_t n); -/** +/* * Return true if the next character in the UTF-8 encoding if it is an uppercase * character. - * - * @param b The bytes to read. - * @param n The number of bytes that can be read. - * @returns True if the next character is valid in the encoding and is an - * uppercase character, or false if it is not. */ bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n); -/** +/* * This lookup table is referenced in both the UTF-8 encoding file and the * parser directly in order to speed up the default encoding processing. It is * used to indicate whether a character is alphabetical, alphanumeric, or @@ -130,9 +97,7 @@ bool pm_encoding_utf_8_isupper_char(const uint8_t *b, ptrdiff_t n); */ extern const uint8_t pm_encoding_unicode_table[256]; -/** - * These are all of the encodings that prism supports. - */ +/* These are all of the encodings that prism supports. */ typedef enum { PM_ENCODING_UTF_8 = 0, PM_ENCODING_US_ASCII, @@ -140,8 +105,8 @@ typedef enum { PM_ENCODING_EUC_JP, PM_ENCODING_WINDOWS_31J, -// We optionally support excluding the full set of encodings to only support the -// minimum necessary to process Ruby code without encoding comments. +/* We optionally support excluding the full set of encodings to only support the + * minimum necessary to process Ruby code without encoding comments. */ #ifndef PRISM_ENCODING_EXCLUDE_FULL PM_ENCODING_BIG5, PM_ENCODING_BIG5_HKSCS, @@ -233,50 +198,44 @@ typedef enum { PM_ENCODING_MAXIMUM } pm_encoding_type_t; -/** - * This is the table of all of the encodings that prism supports. - */ +/* This is the table of all of the encodings that prism supports. */ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; -/** +/* * This is the default UTF-8 encoding. We need a reference to it to quickly * create parsers. */ #define PM_ENCODING_UTF_8_ENTRY (&pm_encodings[PM_ENCODING_UTF_8]) -/** +/* * This is the US-ASCII encoding. We need a reference to it to be able to * compare against it when a string is being created because it could possibly * need to fall back to ASCII-8BIT. */ #define PM_ENCODING_US_ASCII_ENTRY (&pm_encodings[PM_ENCODING_US_ASCII]) -/** +/* * This is the ASCII-8BIT encoding. We need a reference to it so that pm_strpbrk * can compare against it because invalid multibyte characters are not a thing * in this encoding. It is also needed for handling Regexp encoding flags. */ #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) -/** +/* * This is the EUC-JP encoding. We need a reference to it to quickly process * regular expression modifiers. */ #define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) -/** +/* * This is the Windows-31J encoding. We need a reference to it to quickly * process regular expression modifiers. */ #define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) -/** +/* * Parse the given name of an encoding and return a pointer to the corresponding * encoding struct if one can be found, otherwise return NULL. - * - * @param start A pointer to the first byte of the name. - * @param end A pointer to the last byte of the name. - * @returns A pointer to the encoding struct if one is found, otherwise NULL. */ const pm_encoding_t * pm_encoding_find(const uint8_t *start, const uint8_t *end); diff --git a/prism/internal/integer.h b/prism/internal/integer.h new file mode 100644 index 0000000000..7c9767e323 --- /dev/null +++ b/prism/internal/integer.h @@ -0,0 +1,68 @@ +/* + * This module provides functions for working with arbitrary-sized integers. + */ +#ifndef PRISM_INTERNAL_INTEGER_H +#define PRISM_INTERNAL_INTEGER_H + +#include "prism/buffer.h" +#include "prism/integer.h" + +#include <stdint.h> + +/* + * An enum controlling the base of an integer. It is expected that the base is + * already known before parsing the integer, even though it could be derived + * from the string itself. + */ +typedef enum { + /* The default decimal base, with no prefix. Leading 0s will be ignored. */ + PM_INTEGER_BASE_DEFAULT, + + /* The binary base, indicated by a 0b or 0B prefix. */ + PM_INTEGER_BASE_BINARY, + + /* The octal base, indicated by a 0, 0o, or 0O prefix. */ + PM_INTEGER_BASE_OCTAL, + + /* The decimal base, indicated by a 0d, 0D, or empty prefix. */ + PM_INTEGER_BASE_DECIMAL, + + /* The hexadecimal base, indicated by a 0x or 0X prefix. */ + PM_INTEGER_BASE_HEXADECIMAL, + + /* + * An unknown base, in which case pm_integer_parse will derive it based on + * the content of the string. This is less efficient and does more + * comparisons, so if callers know the base ahead of time, they should use + * that instead. + */ + PM_INTEGER_BASE_UNKNOWN +} pm_integer_base_t; + +/* + * Parse an integer from a string. This assumes that the format of the integer + * has already been validated, as internal validation checks are not performed + * here. + */ +void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end); + +/* + * Compare two integers. This function returns -1 if the left integer is less + * than the right integer, 0 if they are equal, and 1 if the left integer is + * greater than the right integer. + */ +int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right); + +/* + * Reduce a ratio of integers to its simplest form. + * + * If either the numerator or denominator do not fit into a 32-bit integer, then + * this function is a no-op. In the future, we may consider reducing even the + * larger numbers, but for now we're going to keep it simple. + */ +void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator); + +/* Convert an integer to a decimal string. */ +void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer); + +#endif diff --git a/prism/internal/isinf.h b/prism/internal/isinf.h new file mode 100644 index 0000000000..41c160f56d --- /dev/null +++ b/prism/internal/isinf.h @@ -0,0 +1,16 @@ +#ifndef PRISM_INTERNAL_ISINF_H +#define PRISM_INTERNAL_ISINF_H + +/* + * isinf on POSIX systems accepts a float, a double, or a long double. But mingw + * didn't provide an isinf macro, only an isinf function that only accepts + * floats, so we need to use _finite instead. + */ +#ifdef __MINGW64__ + #include <float.h> + #define PRISM_ISINF(x) (!_finite(x)) +#else + #define PRISM_ISINF(x) isinf(x) +#endif + +#endif diff --git a/prism/internal/line_offset_list.h b/prism/internal/line_offset_list.h new file mode 100644 index 0000000000..dac9f7052e --- /dev/null +++ b/prism/internal/line_offset_list.h @@ -0,0 +1,34 @@ +#ifndef PRISM_INTERNAL_LINE_OFFSET_LIST_H +#define PRISM_INTERNAL_LINE_OFFSET_LIST_H + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" +#include "prism/line_offset_list.h" + +/* Initialize a new line offset list with the given capacity. */ +void pm_line_offset_list_init(pm_arena_t *arena, pm_line_offset_list_t *list, size_t capacity); + +/* Clear out the offsets that have been appended to the list. */ +void pm_line_offset_list_clear(pm_line_offset_list_t *list); + +/* Append a new offset to the list (slow path with resize). */ +void pm_line_offset_list_append_slow(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor); + +/* Append a new offset to the list. */ +static PRISM_FORCE_INLINE void +pm_line_offset_list_append(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor) { + if (list->size < list->capacity) { + list->offsets[list->size++] = cursor; + } else { + pm_line_offset_list_append_slow(arena, list, cursor); + } +} + +/* + * Returns the line of the given offset. If the offset is not in the list, the + * line of the closest offset less than the given offset is returned. + */ +int32_t pm_line_offset_list_line(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line); + +#endif diff --git a/prism/util/pm_list.h b/prism/internal/list.h index 3512dee979..0ab59ef32a 100644 --- a/prism/util/pm_list.h +++ b/prism/internal/list.h @@ -1,19 +1,9 @@ -/** - * @file pm_list.h - * - * An abstract linked list. - */ -#ifndef PRISM_LIST_H -#define PRISM_LIST_H +#ifndef PRISM_INTERNAL_LIST_H +#define PRISM_INTERNAL_LIST_H -#include "prism/defines.h" - -#include <stdbool.h> #include <stddef.h> -#include <stdint.h> -#include <stdlib.h> -/** +/* * This struct represents an abstract linked list that provides common * functionality. It is meant to be used any time a linked list is necessary to * store data. @@ -44,54 +34,29 @@ * iteration and appending of new nodes. */ typedef struct pm_list_node { - /** A pointer to the next node in the list. */ + /* A pointer to the next node in the list. */ struct pm_list_node *next; } pm_list_node_t; -/** +/* * This represents the overall linked list. It keeps a pointer to the head and * tail so that iteration is easy and pushing new nodes is easy. */ typedef struct { - /** The size of the list. */ + /* The size of the list. */ size_t size; - /** A pointer to the head of the list. */ + /* A pointer to the head of the list. */ pm_list_node_t *head; - /** A pointer to the tail of the list. */ + /* A pointer to the tail of the list. */ pm_list_node_t *tail; } pm_list_t; -/** - * Returns true if the given list is empty. - * - * @param list The list to check. - * @return True if the given list is empty, otherwise false. - */ -PRISM_EXPORTED_FUNCTION bool pm_list_empty_p(pm_list_t *list); +/* Returns the size of the list. */ +size_t pm_list_size(pm_list_t *list); -/** - * Returns the size of the list. - * - * @param list The list to check. - * @return The size of the list. - */ -PRISM_EXPORTED_FUNCTION size_t pm_list_size(pm_list_t *list); - -/** - * Append a node to the given list. - * - * @param list The list to append to. - * @param node The node to append. - */ +/* Append a node to the given list. */ void pm_list_append(pm_list_t *list, pm_list_node_t *node); -/** - * Deallocate the internal state of the given list. - * - * @param list The list to free. - */ -PRISM_EXPORTED_FUNCTION void pm_list_free(pm_list_t *list); - #endif diff --git a/prism/internal/magic_comments.h b/prism/internal/magic_comments.h new file mode 100644 index 0000000000..72a581c5d7 --- /dev/null +++ b/prism/internal/magic_comments.h @@ -0,0 +1,23 @@ +#ifndef PRISM_INTERNAL_MAGIC_COMMENTS_H +#define PRISM_INTERNAL_MAGIC_COMMENTS_H + +#include "prism/magic_comments.h" + +#include "prism/internal/list.h" + +/* + * This is a node in the linked list of magic comments that we've found while + * parsing. + */ +struct pm_magic_comment_t { + /* The embedded base node. */ + pm_list_node_t node; + + /* The key of the magic comment. */ + pm_location_t key; + + /* The value of the magic comment. */ + pm_location_t value; +}; + +#endif diff --git a/prism/internal/memchr.h b/prism/internal/memchr.h new file mode 100644 index 0000000000..6f6b0bca30 --- /dev/null +++ b/prism/internal/memchr.h @@ -0,0 +1,15 @@ +#ifndef PRISM_INTERNAL_MEMCHR_H +#define PRISM_INTERNAL_MEMCHR_H + +#include "prism/internal/encoding.h" + +#include <stddef.h> + +/* + * We need to roll our own memchr to handle cases where the encoding changes and + * we need to search for a character in a buffer that could be the trailing byte + * of a multibyte character. + */ +const void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding); + +#endif diff --git a/prism/internal/node.h b/prism/internal/node.h new file mode 100644 index 0000000000..ca6d5616d7 --- /dev/null +++ b/prism/internal/node.h @@ -0,0 +1,32 @@ +#ifndef PRISM_INTERNAL_NODE_H +#define PRISM_INTERNAL_NODE_H + +#include "prism/node.h" + +#include "prism/compiler/force_inline.h" + +#include "prism/arena.h" + +/* + * Slow path for pm_node_list_append: grow the list and append the node. + * Do not call directly — use pm_node_list_append instead. + */ +void pm_node_list_append_slow(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node); + +/* Append a new node onto the end of the node list. */ +static PRISM_FORCE_INLINE void +pm_node_list_append(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) { + if (list->size < list->capacity) { + list->nodes[list->size++] = node; + } else { + pm_node_list_append_slow(arena, list, node); + } +} + +/* Prepend a new node onto the beginning of the node list. */ +void pm_node_list_prepend(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node); + +/* Concatenate the given node list onto the end of the other node list. */ +void pm_node_list_concat(pm_arena_t *arena, pm_node_list_t *list, pm_node_list_t *other); + +#endif diff --git a/prism/internal/options.h b/prism/internal/options.h new file mode 100644 index 0000000000..7e37742a8b --- /dev/null +++ b/prism/internal/options.h @@ -0,0 +1,212 @@ +#ifndef PRISM_INTERNAL_OPTIONS_H +#define PRISM_INTERNAL_OPTIONS_H + +#include "prism/options.h" + +/* A scope of locals surrounding the code that is being parsed. */ +struct pm_options_scope_t { + /* The number of locals in the scope. */ + size_t locals_count; + + /* The names of the locals in the scope. */ + pm_string_t *locals; + + /* Flags for the set of forwarding parameters in this scope. */ + uint8_t forwarding; +}; + +/* + * The version of Ruby syntax that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse in the same way as a specific version of CRuby would have. + */ +typedef enum { + /* + * If an explicit version is not provided, the current version of prism will + * be used. + */ + PM_OPTIONS_VERSION_UNSET = 0, + + /* The vendored version of prism in CRuby 3.3.x. */ + PM_OPTIONS_VERSION_CRUBY_3_3 = 1, + + /* The vendored version of prism in CRuby 3.4.x. */ + PM_OPTIONS_VERSION_CRUBY_3_4 = 2, + + /* The vendored version of prism in CRuby 4.0.x. */ + PM_OPTIONS_VERSION_CRUBY_3_5 = 3, + + /* The vendored version of prism in CRuby 4.0.x. */ + PM_OPTIONS_VERSION_CRUBY_4_0 = 3, + + /* The vendored version of prism in CRuby 4.1.x. */ + PM_OPTIONS_VERSION_CRUBY_4_1 = 4, + + /* The current version of prism. */ + PM_OPTIONS_VERSION_LATEST = PM_OPTIONS_VERSION_CRUBY_4_1 +} pm_options_version_t; + +/* The options that can be passed to the parser. */ +struct pm_options_t { + /* + * The callback to call when additional switches are found in a shebang + * comment. + */ + pm_options_shebang_callback_t shebang_callback; + + /* + * Any additional data that should be passed along to the shebang callback + * if one was set. + */ + void *shebang_callback_data; + + /* The name of the file that is currently being parsed. */ + pm_string_t filepath; + + /* + * The line within the file that the parse starts on. This value is + * 1-indexed. + */ + int32_t line; + + /* + * The name of the encoding that the source file is in. Note that this must + * correspond to a name that can be found with Encoding.find in Ruby. + */ + pm_string_t encoding; + + /* The number of scopes surrounding the code that is being parsed. */ + size_t scopes_count; + + /* + * The scopes surrounding the code that is being parsed. For most parses + * this will be NULL, but for evals it will be the locals that are in scope + * surrounding the eval. Scopes are ordered from the outermost scope to the + * innermost one. + */ + pm_options_scope_t *scopes; + + /* + * The version of prism that we should be parsing with. This is used to + * allow consumers to specify which behavior they want in case they need to + * parse exactly as a specific version of CRuby. + */ + pm_options_version_t version; + + /* A bitset of the various options that were set on the command line. */ + uint8_t command_line; + + /* + * Whether or not the frozen string literal option has been set. + * May be: + * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET + */ + int8_t frozen_string_literal; + + /* + * Whether or not the encoding magic comments should be respected. This is a + * niche use-case where you want to parse a file with a specific encoding + * but ignore any encoding magic comments at the top of the file. + */ + bool encoding_locked; + + /* + * When the file being parsed is the main script, the shebang will be + * considered for command-line flags (or for implicit -x). The caller needs + * to pass this information to the parser so that it can behave correctly. + */ + bool main_script; + + /* + * When the file being parsed is considered a "partial" script, jumps will + * not be marked as errors if they are not contained within loops/blocks. + * This is used in the case that you're parsing a script that you know will + * be embedded inside another script later, but you do not have that context + * yet. For example, when parsing an ERB template that will be evaluated + * inside another script. + */ + bool partial_script; + + /* + * Whether or not the parser should freeze the nodes that it creates. This + * makes it possible to have a deeply frozen AST that is safe to share + * between concurrency primitives. + */ + bool freeze; +}; + +/* Free the internal memory associated with the options. */ +void pm_options_cleanup(pm_options_t *options); + +/* + * Deserialize an options struct from the given binary string. This is used to + * pass options to the parser from an FFI call so that consumers of the library + * from an FFI perspective don't have to worry about the structure of our + * options structs. Since the source of these calls will be from Ruby + * implementation internals we assume it is from a trusted source. + * + * `data` is assumed to be a valid pointer pointing to well-formed data. The + * layout of this data should be the same every time, and is described below: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the filepath | + * | ... | the filepath bytes | + * | `4` | the line number | + * | `4` | the length the encoding | + * | ... | the encoding bytes | + * | `1` | frozen string literal | + * | `1` | -p command line option | + * | `1` | -n command line option | + * | `1` | -l command line option | + * | `1` | -a command line option | + * | `1` | the version | + * | `1` | encoding locked | + * | `1` | main script | + * | `1` | partial script | + * | `1` | freeze | + * | `4` | the number of scopes | + * | ... | the scopes | + * + * The version field is an enum, so it should be one of the following values: + * + * | value | version | + * | ----- | ------------------------- | + * | `0` | use the latest version of prism | + * | `1` | use the version of prism that is vendored in CRuby 3.3.0 | + * | `2` | use the version of prism that is vendored in CRuby 3.4.0 | + * | `3` | use the version of prism that is vendored in CRuby 4.0.0 | + * | `4` | use the version of prism that is vendored in CRuby 4.1.0 | + * + * Each scope is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the number of locals | + * | `1` | the forwarding flags | + * | ... | the locals | + * + * Each local is laid out as follows: + * + * | # bytes | field | + * | ------- | -------------------------- | + * | `4` | the length of the local | + * | ... | the local bytes | + * + * Some additional things to note about this layout: + * + * * The filepath can have a length of 0, in which case we'll consider it an + * empty string. + * * The line number should be 0-indexed. + * * The encoding can have a length of 0, in which case we'll use the default + * encoding (UTF-8). If it's not 0, it should correspond to a name of an + * encoding that can be passed to `Encoding.find` in Ruby. + * * The frozen string literal, encoding locked, main script, and partial script + * fields are booleans, so their values should be either 0 or 1. + * * The number of scopes can be 0. + */ +void pm_options_read(pm_options_t *options, const char *data); + +#endif diff --git a/prism/internal/parser.h b/prism/internal/parser.h new file mode 100644 index 0000000000..4320cf4029 --- /dev/null +++ b/prism/internal/parser.h @@ -0,0 +1,958 @@ +#ifndef PRISM_INTERNAL_PARSER_H +#define PRISM_INTERNAL_PARSER_H + +#include "prism/compiler/accel.h" + +#include "prism/internal/arena.h" +#include "prism/internal/constant_pool.h" +#include "prism/internal/encoding.h" +#include "prism/internal/list.h" +#include "prism/internal/options.h" +#include "prism/internal/static_literals.h" +#include "prism/internal/strpbrk.h" + +#include "prism/ast.h" +#include "prism/line_offset_list.h" +#include "prism/parser.h" + +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> + +/* + * This enum provides various bits that represent different kinds of states that + * the lexer can track. This is used to determine which kind of token to return + * based on the context of the parser. + */ +typedef enum { + PM_LEX_STATE_BIT_BEG, + PM_LEX_STATE_BIT_END, + PM_LEX_STATE_BIT_ENDARG, + PM_LEX_STATE_BIT_ENDFN, + PM_LEX_STATE_BIT_ARG, + PM_LEX_STATE_BIT_CMDARG, + PM_LEX_STATE_BIT_MID, + PM_LEX_STATE_BIT_FNAME, + PM_LEX_STATE_BIT_DOT, + PM_LEX_STATE_BIT_CLASS, + PM_LEX_STATE_BIT_LABEL, + PM_LEX_STATE_BIT_LABELED, + PM_LEX_STATE_BIT_FITEM +} pm_lex_state_bit_t; + +/* + * This enum combines the various bits from the above enum into individual + * values that represent the various states of the lexer. + */ +typedef enum { + PM_LEX_STATE_NONE = 0, + PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), + PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), + PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), + PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), + PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), + PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), + PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), + PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), + PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), + PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), + PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), + PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), + PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), + PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, + PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, + PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN +} pm_lex_state_t; + +/* + * The type of quote that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_QUOTE_NONE, + PM_HEREDOC_QUOTE_SINGLE = '\'', + PM_HEREDOC_QUOTE_DOUBLE = '"', + PM_HEREDOC_QUOTE_BACKTICK = '`', +} pm_heredoc_quote_t; + +/* + * The type of indentation that a heredoc uses. + */ +typedef enum { + PM_HEREDOC_INDENT_NONE, + PM_HEREDOC_INDENT_DASH, + PM_HEREDOC_INDENT_TILDE, +} pm_heredoc_indent_t; + +/* + * All of the information necessary to store to lexing a heredoc. + */ +typedef struct { + /* A pointer to the start of the heredoc identifier. */ + const uint8_t *ident_start; + + /* The length of the heredoc identifier. */ + size_t ident_length; + + /* The type of quote that the heredoc uses. */ + pm_heredoc_quote_t quote; + + /* The type of indentation that the heredoc uses. */ + pm_heredoc_indent_t indent; +} pm_heredoc_lex_mode_t; + +/* + * When lexing Ruby source, the lexer has a small amount of state to tell which + * kind of token it is currently lexing. For example, when we find the start of + * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After + * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that + * are found as part of a string. + */ +typedef struct pm_lex_mode { + /* The type of this lex mode. */ + enum { + /* This state is used when any given token is being lexed. */ + PM_LEX_DEFAULT, + + /* + * This state is used when we're lexing as normal but inside an embedded + * expression of a string. + */ + PM_LEX_EMBEXPR, + + /* + * This state is used when we're lexing a variable that is embedded + * directly inside of a string with the # shorthand. + */ + PM_LEX_EMBVAR, + + /* This state is used when you are inside the content of a heredoc. */ + PM_LEX_HEREDOC, + + /* + * This state is used when we are lexing a list of tokens, as in a %w + * word list literal or a %i symbol list literal. + */ + PM_LEX_LIST, + + /* + * This state is used when a regular expression has been begun and we + * are looking for the terminator. + */ + PM_LEX_REGEXP, + + /* + * This state is used when we are lexing a string or a string-like + * token, as in string content with either quote or an xstring. + */ + PM_LEX_STRING + } mode; + + /* The data associated with this type of lex mode. */ + union { + struct { + /* This keeps track of the nesting level of the list. */ + size_t nesting; + + /* Whether or not interpolation is allowed in this list. */ + bool interpolation; + + /* + * When lexing a list, it takes into account balancing the + * terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* This is the terminator of the list literal. */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the list. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } list; + + struct { + /* + * This keeps track of the nesting level of the regular expression. + */ + size_t nesting; + + /* + * When lexing a regular expression, it takes into account balancing + * the terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* This is the terminator of the regular expression. */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the regular expression. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } regexp; + + struct { + /* This keeps track of the nesting level of the string. */ + size_t nesting; + + /* Whether or not interpolation is allowed in this string. */ + bool interpolation; + + /* + * Whether or not at the end of the string we should allow a :, + * which would indicate this was a dynamic symbol instead of a + * string. + */ + bool label_allowed; + + /* + * When lexing a string, it takes into account balancing the + * terminator if the terminator is one of (), [], {}, or <>. + */ + uint8_t incrementor; + + /* + * This is the terminator of the string. It is typically either a + * single or double quote. + */ + uint8_t terminator; + + /* + * This is the character set that should be used to delimit the + * tokens within the string. + */ + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE]; + } string; + + struct { + /* + * All of the data necessary to lex a heredoc. + */ + pm_heredoc_lex_mode_t base; + + /* + * This is the pointer to the character where lexing should resume + * once the heredoc has been completely processed. + */ + const uint8_t *next_start; + + /* + * This is used to track the amount of common whitespace on each + * line so that we know how much to dedent each line in the case of + * a tilde heredoc. + */ + size_t *common_whitespace; + + /* True if the previous token ended with a line continuation. */ + bool line_continuation; + } heredoc; + } as; + + /* The previous lex state so that it knows how to pop. */ + struct pm_lex_mode *prev; +} pm_lex_mode_t; + +/* + * We pre-allocate a certain number of lex states in order to avoid having to + * call malloc too many times while parsing. You really shouldn't need more than + * this because you only really nest deeply when doing string interpolation. + */ +#define PM_LEX_STACK_SIZE 4 + +/* + * While parsing, we keep track of a stack of contexts. This is helpful for + * error recovery so that we can pop back to a previous context when we hit a + * token that is understood by a parent context but not by the current context. + */ +typedef enum { + /* a null context, used for returning a value from a function */ + PM_CONTEXT_NONE = 0, + + /* a begin statement */ + PM_CONTEXT_BEGIN, + + /* an ensure statement with an explicit begin */ + PM_CONTEXT_BEGIN_ENSURE, + + /* a rescue else statement with an explicit begin */ + PM_CONTEXT_BEGIN_ELSE, + + /* a rescue statement with an explicit begin */ + PM_CONTEXT_BEGIN_RESCUE, + + /* expressions in block arguments using braces */ + PM_CONTEXT_BLOCK_BRACES, + + /* expressions in block arguments using do..end */ + PM_CONTEXT_BLOCK_KEYWORDS, + + /* an ensure statement within a do..end block */ + PM_CONTEXT_BLOCK_ENSURE, + + /* a rescue else statement within a do..end block */ + PM_CONTEXT_BLOCK_ELSE, + + /* expressions in block parameters `foo do |...| end ` */ + PM_CONTEXT_BLOCK_PARAMETERS, + + /* a rescue statement within a do..end block */ + PM_CONTEXT_BLOCK_RESCUE, + + /* a case when statements */ + PM_CONTEXT_CASE_WHEN, + + /* a case in statements */ + PM_CONTEXT_CASE_IN, + + /* a class declaration */ + PM_CONTEXT_CLASS, + + /* an ensure statement within a class statement */ + PM_CONTEXT_CLASS_ENSURE, + + /* a rescue else statement within a class statement */ + PM_CONTEXT_CLASS_ELSE, + + /* a rescue statement within a class statement */ + PM_CONTEXT_CLASS_RESCUE, + + /* a method definition */ + PM_CONTEXT_DEF, + + /* an ensure statement within a method definition */ + PM_CONTEXT_DEF_ENSURE, + + /* a rescue else statement within a method definition */ + PM_CONTEXT_DEF_ELSE, + + /* a rescue statement within a method definition */ + PM_CONTEXT_DEF_RESCUE, + + /* a method definition's parameters */ + PM_CONTEXT_DEF_PARAMS, + + /* a defined? expression */ + PM_CONTEXT_DEFINED, + + /* a method definition's default parameter */ + PM_CONTEXT_DEFAULT_PARAMS, + + /* an else clause */ + PM_CONTEXT_ELSE, + + /* an elsif clause */ + PM_CONTEXT_ELSIF, + + /* an interpolated expression */ + PM_CONTEXT_EMBEXPR, + + /* a for loop */ + PM_CONTEXT_FOR, + + /* a for loop's index */ + PM_CONTEXT_FOR_INDEX, + + /* an if statement */ + PM_CONTEXT_IF, + + /* a lambda expression with braces */ + PM_CONTEXT_LAMBDA_BRACES, + + /* a lambda expression with do..end */ + PM_CONTEXT_LAMBDA_DO_END, + + /* an ensure statement within a lambda expression */ + PM_CONTEXT_LAMBDA_ENSURE, + + /* a rescue else statement within a lambda expression */ + PM_CONTEXT_LAMBDA_ELSE, + + /* a rescue statement within a lambda expression */ + PM_CONTEXT_LAMBDA_RESCUE, + + /* the predicate clause of a loop statement */ + PM_CONTEXT_LOOP_PREDICATE, + + /* the top level context */ + PM_CONTEXT_MAIN, + + /* a module declaration */ + PM_CONTEXT_MODULE, + + /* an ensure statement within a module statement */ + PM_CONTEXT_MODULE_ENSURE, + + /* a rescue else statement within a module statement */ + PM_CONTEXT_MODULE_ELSE, + + /* a rescue statement within a module statement */ + PM_CONTEXT_MODULE_RESCUE, + + /* a multiple target expression */ + PM_CONTEXT_MULTI_TARGET, + + /* a parenthesized expression */ + PM_CONTEXT_PARENS, + + /* an END block */ + PM_CONTEXT_POSTEXE, + + /* a predicate inside an if/elsif/unless statement */ + PM_CONTEXT_PREDICATE, + + /* a BEGIN block */ + PM_CONTEXT_PREEXE, + + /* a modifier rescue clause */ + PM_CONTEXT_RESCUE_MODIFIER, + + /* a singleton class definition */ + PM_CONTEXT_SCLASS, + + /* an ensure statement with a singleton class */ + PM_CONTEXT_SCLASS_ENSURE, + + /* a rescue else statement with a singleton class */ + PM_CONTEXT_SCLASS_ELSE, + + /* a rescue statement with a singleton class */ + PM_CONTEXT_SCLASS_RESCUE, + + /* a ternary expression */ + PM_CONTEXT_TERNARY, + + /* an unless statement */ + PM_CONTEXT_UNLESS, + + /* an until statement */ + PM_CONTEXT_UNTIL, + + /* a while statement */ + PM_CONTEXT_WHILE, +} pm_context_t; + +/* This is a node in a linked list of contexts. */ +typedef struct pm_context_node { + /* The context that this node represents. */ + pm_context_t context; + + /* A pointer to the previous context in the linked list. */ + struct pm_context_node *prev; +} pm_context_node_t; + +/* The type of shareable constant value that can be set. */ +typedef uint8_t pm_shareable_constant_value_t; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING; +static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY; + +/* + * This tracks an individual local variable in a certain lexical context, as + * well as the number of times is it read. + */ +typedef struct { + /* The name of the local variable. */ + pm_constant_id_t name; + + /* The location of the local variable in the source. */ + pm_location_t location; + + /* The index of the local variable in the local table. */ + uint32_t index; + + /* The number of times the local variable is read. */ + uint32_t reads; + + /* The hash of the local variable. */ + uint32_t hash; +} pm_local_t; + +/* + * This is a set of local variables in a certain lexical context (method, class, + * module, etc.). We need to track how many times these variables are read in + * order to warn if they only get written. + */ +typedef struct pm_locals { + /* The number of local variables in the set. */ + uint32_t size; + + /* The capacity of the local variables set. */ + uint32_t capacity; + + /* + * A bloom filter over constant IDs stored in this set. Used to quickly + * reject lookups for names that are definitely not present, avoiding the + * cost of a linear scan or hash probe. + */ + uint32_t bloom; + + /* The nullable allocated memory for the local variables in the set. */ + pm_local_t *locals; +} pm_locals_t; + +/* The flags about scope parameters that can be set. */ +typedef uint8_t pm_scope_parameters_t; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20; +static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40; + +/* + * This struct represents a node in a linked list of scopes. Some scopes can see + * into their parent scopes, while others cannot. + */ +typedef struct pm_scope { + /* A pointer to the previous scope in the linked list. */ + struct pm_scope *previous; + + /* The IDs of the locals in the given scope. */ + pm_locals_t locals; + + /* + * This is a list of the implicit parameters contained within the block. + * These will be processed after the block is parsed to determine the kind + * of parameters node that should be used and to check if any errors need to + * be added. + */ + pm_node_list_t implicit_parameters; + + /* + * This is a bitfield that indicates the parameters that are being used in + * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants. + * There are three different kinds of parameters that can be used in a + * scope: + * + * - Ordinary parameters (e.g., def foo(bar); end) + * - Numbered parameters (e.g., def foo; _1; end) + * - The it parameter (e.g., def foo; it; end) + * + * If ordinary parameters are being used, then certain parameters can be + * forwarded to another method/structure. Those are indicated by four + * additional bits in the params field. For example, some combinations of: + * + * - def foo(*); end + * - def foo(**); end + * - def foo(&); end + * - def foo(...); end + */ + pm_scope_parameters_t parameters; + + /* + * The current state of constant shareability for this scope. This is + * changed by magic shareable_constant_value comments. + */ + pm_shareable_constant_value_t shareable_constant; + + /* + * A boolean indicating whether or not this scope can see into its parent. + * If closed is true, then the scope cannot see into its parent. + */ + bool closed; +} pm_scope_t; + +/* + * A struct that represents a stack of boolean values. + */ +typedef uint32_t pm_state_stack_t; + +/* + * This struct represents the overall parser. It contains a reference to the + * source file, as well as pointers that indicate where in the source it's + * currently parsing. It also contains the most recent and current token that + * it's considering. + */ +struct pm_parser_t { + /* The arena used for all AST-lifetime allocations. Caller-owned. */ + pm_arena_t *arena; + + /* The arena used for parser metadata (comments, diagnostics, etc.). */ + pm_arena_t metadata_arena; + + /* + * The next node identifier that will be assigned. This is a unique + * identifier used to track nodes such that the syntax tree can be dropped + * but the node can be found through another parse. + */ + uint32_t node_id; + + /* + * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant + * constant pool lookups when the same token is resolved multiple times + * (e.g., once during lexing for local variable detection, and again + * during parsing for node creation). + */ + struct { + const uint8_t *start; + const uint8_t *end; + pm_constant_id_t id; + } constant_cache; + + /* The current state of the lexer. */ + pm_lex_state_t lex_state; + + /* Tracks the current nesting of (), [], and {}. */ + int enclosure_nesting; + + /* + * Used to temporarily track the nesting of enclosures to determine if a { + * is the beginning of a lambda following the parameters of a lambda. + */ + int lambda_enclosure_nesting; + + /* + * Used to track the nesting of braces to ensure we get the correct value + * when we are interpolating blocks with braces. + */ + int brace_nesting; + + /* + * The stack used to determine if a do keyword belongs to the predicate of a + * while, until, or for loop. + */ + pm_state_stack_t do_loop_stack; + + /* + * The stack used to determine if a do keyword belongs to the beginning of a + * block. + */ + pm_state_stack_t accepts_block_stack; + + /* A stack of lex modes. */ + struct { + /* The current mode of the lexer. */ + pm_lex_mode_t *current; + + /* The stack of lexer modes. */ + pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; + + /* The current index into the lexer mode stack. */ + size_t index; + } lex_modes; + + /* The pointer to the start of the source. */ + const uint8_t *start; + + /* The pointer to the end of the source. */ + const uint8_t *end; + + /* The previous token we were considering. */ + pm_token_t previous; + + /* The current token we're considering. */ + pm_token_t current; + + /* + * This is a special field set on the parser when we need the parser to jump + * to a specific location when lexing the next token, as opposed to just + * using the end of the previous token. Normally this is NULL. + */ + const uint8_t *next_start; + + /* + * This field indicates the end of a heredoc whose identifier was found on + * the current line. If another heredoc is found on the same line, then this + * will be moved forward to the end of that heredoc. If no heredocs are + * found on a line then this is NULL. + */ + const uint8_t *heredoc_end; + + /* The list of comments that have been found while parsing. */ + pm_list_t comment_list; + + /* The list of magic comments that have been found while parsing. */ + pm_list_t magic_comment_list; + + /* + * An optional location that represents the location of the __END__ marker + * and the rest of the content of the file. This content is loaded into the + * DATA constant when the file being parsed is the main file being executed. + */ + pm_location_t data_loc; + + /* The list of warnings that have been found while parsing. */ + pm_list_t warning_list; + + /* The list of errors that have been found while parsing. */ + pm_list_t error_list; + + /* The current local scope. */ + pm_scope_t *current_scope; + + /* The current parsing context. */ + pm_context_node_t *current_context; + + /* + * The hash keys for the hash that is currently being parsed. This is not + * usually necessary because it can pass it down the various call chains, + * but in the event that you're parsing a hash that is being directly + * pushed into another hash with **, we need to share the hash keys so that + * we can warn for the nested hash as well. + */ + pm_static_literals_t *current_hash_keys; + + /* + * The encoding functions for the current file is attached to the parser as + * it's parsing so that it can change with a magic comment. + */ + const pm_encoding_t *encoding; + + /* + * When the encoding that is being used to parse the source is changed by + * prism, we provide the ability here to call out to a user-defined + * function. + */ + pm_encoding_changed_callback_t encoding_changed_callback; + + /* + * This pointer indicates where a comment must start if it is to be + * considered an encoding comment. + */ + const uint8_t *encoding_comment_start; + + /* + * When you are lexing through a file, the lexer needs all of the information + * that the parser additionally provides (for example, the local table). So if + * you want to properly lex Ruby, you need to actually lex it in the context of + * the parser. In order to provide this functionality, we optionally allow a + * struct to be attached to the parser that calls back out to a user-provided + * callback when each token is lexed. + */ + struct { + /* + * This is the callback that is called when a token is lexed. It is + * passed the opaque data pointer, the parser, and the token that was + * lexed. + */ + pm_lex_callback_t callback; + + /* + * This opaque pointer is used to provide whatever information the user + * deemed necessary to the callback. In our case we use it to pass the + * array that the tokens get appended into. + */ + void *data; + } lex_callback; + + /* + * This is the path of the file being parsed. We use the filepath when + * constructing SourceFileNodes. + */ + pm_string_t filepath; + + /* + * This constant pool keeps all of the constants defined throughout the file + * so that we can reference them later. + */ + pm_constant_pool_t constant_pool; + + /* This is the list of line offsets in the source file. */ + pm_line_offset_list_t line_offsets; + + /* + * State communicated from the lexer to the parser for integer tokens. + */ + struct { + /* + * A flag indicating the base of the integer (binary, octal, decimal, + * hexadecimal). Set during lexing and read during node creation. + */ + pm_node_flags_t base; + + /* + * When lexing a decimal integer that fits in a uint32_t, we compute + * the value during lexing to avoid re-scanning the digits during + * parsing. If lexed is true, this holds the result and + * pm_integer_parse can be skipped. + */ + uint32_t value; + + /* Whether value holds a valid pre-computed integer. */ + bool lexed; + } integer; + + /* + * This string is used to pass information from the lexer to the parser. It + * is particularly necessary because of escape sequences. + */ + pm_string_t current_string; + + /* + * The line number at the start of the parse. This will be used to offset + * the line numbers of all of the locations. + */ + int32_t start_line; + + /* + * When a string-like expression is being lexed, any byte or escape sequence + * that resolves to a value whose top bit is set (i.e., >= 0x80) will + * explicitly set the encoding to the same encoding as the source. + * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that + * resolves to a value whose top bit is set, then the encoding will be + * explicitly set to UTF-8. + * + * The _next_ time this happens, if the encoding that is about to become the + * explicitly set encoding does not match the previously set explicit + * encoding, a mixed encoding error will be emitted. + * + * When the expression is finished being lexed, the explicit encoding + * controls the encoding of the expression. For the most part this means + * that the expression will either be encoded in the source encoding or + * UTF-8. This holds for all encodings except US-ASCII. If the source is + * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the + * expression will be encoded as ASCII-8BIT. + * + * Note that if the expression is a list, different elements within the same + * list can have different encodings, so this will get reset between each + * element. Furthermore all of this only applies to lists that support + * interpolation, because otherwise escapes that could change the encoding + * are ignored. + * + * At first glance, it may make more sense for this to live on the lexer + * mode, but we need it here to communicate back to the parser for character + * literals that do not push a new lexer mode. + */ + const pm_encoding_t *explicit_encoding; + + /* + * When parsing block exits (e.g., break, next, redo), we need to validate + * that they are in correct contexts. For the most part we can do this by + * looking at our parent contexts. However, modifier while and until + * expressions can change that context to make block exits valid. In these + * cases, we need to keep track of the block exits and then validate them + * after the expression has been parsed. + * + * We use a pointer here because we don't want to keep a whole list attached + * since this will only be used in the context of begin/end expressions. + */ + pm_node_list_t *current_block_exits; + + /* The version of prism that we should use to parse. */ + pm_options_version_t version; + + /* The command line flags given from the options. */ + uint8_t command_line; + + /* + * Whether or not we have found a frozen_string_literal magic comment with + * a true or false value. + * May be: + * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED + * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET + */ + int8_t frozen_string_literal; + + /* + * Whether or not we are parsing an eval string. This impacts whether or not + * we should evaluate if block exits/yields are valid. + */ + bool parsing_eval; + + /* + * Whether or not we are parsing a "partial" script, which is a script that + * will be evaluated in the context of another script, so we should not + * check jumps (next/break/etc.) for validity. + */ + bool partial_script; + + /* Whether or not we're at the beginning of a command. */ + bool command_start; + + /* + * Whether or not we're currently parsing the body of an endless method + * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be + * consumed by commands (it should bubble up to the outer context). + */ + bool in_endless_def_body; + + /* Whether or not we're currently recovering from a syntax error. */ + bool recovering; + + /* + * Whether or not the source being parsed could become valid if more input + * were appended. This is set to false when the parser encounters a token + * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to + * merely incomplete. + */ + bool continuable; + + /* + * This is very specialized behavior for when you want to parse in a context + * that does not respect encoding comments. Its main use case is translating + * into the whitequark/parser AST which re-encodes source files in UTF-8 + * before they are parsed and ignores encoding comments. + */ + bool encoding_locked; + + /* + * Whether or not the encoding has been changed by a magic comment. We use + * this to provide a fast path for the lexer instead of going through the + * function pointer. + */ + bool encoding_changed; + + /* + * This flag indicates that we are currently parsing a pattern matching + * expression and impacts that calculation of newlines. + */ + bool pattern_matching_newlines; + + /* This flag indicates that we are currently parsing a keyword argument. */ + bool in_keyword_arg; + + /* + * Whether or not the parser has seen a token that has semantic meaning + * (i.e., a token that is not a comment or whitespace). + */ + bool semantic_token_seen; + + /* + * By default, Ruby always warns about mismatched indentation. This can be + * toggled with a magic comment. + */ + bool warn_mismatched_indentation; + +#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR) + /* + * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding + * the nibble-based tables on every call when the charset hasn't changed + * (which is the common case during string/regex/list lexing). + */ + struct { + /* The cached charset (null-terminated, max 11 chars + NUL). */ + uint8_t charset[12]; + + /* Nibble-based low lookup table for SIMD matching. */ + uint8_t low_lut[16]; + + /* Nibble-based high lookup table for SIMD matching. */ + uint8_t high_lut[16]; + + /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */ + uint64_t table[4]; + } strpbrk_cache; +#endif +}; + +/* + * Initialize a parser with the given start and end pointers. + */ +void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options); + +/* + * Free the memory held by the given parser. + * + * This does not free the `pm_options_t` object that was used to initialize the + * parser. + */ +void pm_parser_cleanup(pm_parser_t *parser); + +#endif diff --git a/prism/internal/regexp.h b/prism/internal/regexp.h new file mode 100644 index 0000000000..3710c984fc --- /dev/null +++ b/prism/internal/regexp.h @@ -0,0 +1,41 @@ +#ifndef PRISM_INTERNAL_REGEXP_H +#define PRISM_INTERNAL_REGEXP_H + +#include "prism/ast.h" +#include "prism/parser.h" + +/* + * Accumulation state for named capture groups found during regexp parsing. + * The caller initializes this with the call node and passes it to + * pm_regexp_parse. The regexp parser populates match and names as groups + * are found. + */ +typedef struct { + /* The call node wrapping the regular expression node (for =~). */ + pm_call_node_t *call; + + /* The match write node being built, or NULL if no captures found yet. */ + pm_match_write_node_t *match; + + /* The list of capture names found so far (for deduplication). */ + pm_constant_id_list_t names; +} pm_regexp_name_data_t; + +/* + * Callback invoked by pm_regexp_parse() for each named capture group found. + */ +typedef void (*pm_regexp_name_callback_t)(pm_parser_t *parser, const pm_string_t *name, bool shared, pm_regexp_name_data_t *data); + +/* + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Returns the encoding flags to set on the node. + */ +PRISM_EXPORTED_FUNCTION pm_node_flags_t pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); + +/* + * Parse an interpolated regular expression for named capture groups only. + * No encoding validation is performed. + */ +void pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data); + +#endif diff --git a/prism/internal/serialize.h b/prism/internal/serialize.h new file mode 100644 index 0000000000..e611a0374b --- /dev/null +++ b/prism/internal/serialize.h @@ -0,0 +1,34 @@ +#ifndef PRISM_INTERNAL_SERIALIZE_H +#define PRISM_INTERNAL_SERIALIZE_H + +#include "prism/internal/encoding.h" +#include "prism/internal/list.h" + +#include "prism/ast.h" +#include "prism/buffer.h" +#include "prism/excludes.h" +#include "prism/parser.h" + +/* We optionally support serializing to a binary string. For systems that do not + * want or need this functionality, it can be turned off with the + * PRISM_EXCLUDE_SERIALIZATION define. */ +#ifndef PRISM_EXCLUDE_SERIALIZATION + +/* + * Serialize the given list of comments to the given buffer. + */ +void pm_serialize_comment_list(pm_list_t *list, pm_buffer_t *buffer); + +/* + * Serialize the name of the encoding to the buffer. + */ +void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer); + +/* + * Serialize the encoding, metadata, nodes, and constant pool. + */ +void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); + +#endif + +#endif diff --git a/prism/internal/source.h b/prism/internal/source.h new file mode 100644 index 0000000000..b3c2b55be3 --- /dev/null +++ b/prism/internal/source.h @@ -0,0 +1,72 @@ +#ifndef PRISM_INTERNAL_SOURCE_H +#define PRISM_INTERNAL_SOURCE_H + +#include "prism/source.h" +#include "prism/buffer.h" + +#include <stdbool.h> + +/* + * The type of source, which determines cleanup behavior. + */ +typedef enum { + /* Wraps existing constant memory, no cleanup. */ + PM_SOURCE_CONSTANT, + + /* Wraps existing shared memory (non-owning slice), no cleanup. */ + PM_SOURCE_SHARED, + + /* Owns a heap-allocated buffer, freed on cleanup. */ + PM_SOURCE_OWNED, + + /* Memory-mapped file, unmapped on cleanup. */ + PM_SOURCE_MAPPED, + + /* Stream source backed by a pm_buffer_t. */ + PM_SOURCE_STREAM +} pm_source_type_t; + +/* + * The internal representation of a source. + */ +struct pm_source_t { + /* A pointer to the start of the source data. */ + const uint8_t *source; + + /* The length of the source data in bytes. */ + size_t length; + + /* The type of the source. */ + pm_source_type_t type; + + /* Stream-specific data, only used for PM_SOURCE_STREAM sources. */ + struct { + /* The buffer that holds the accumulated stream data. */ + pm_buffer_t *buffer; + + /* The stream object to read from. */ + void *stream; + + /* The function to use to read from the stream. */ + pm_source_stream_fgets_t *fgets; + + /* The function to use to check if the stream is at EOF. */ + pm_source_stream_feof_t *feof; + + /* Whether the stream has reached EOF. */ + bool eof; + } stream; +}; + +/* + * Read from a stream into the source's internal buffer. This is used by + * pm_parse_stream to incrementally read the source. + */ +bool pm_source_stream_read(pm_source_t *source); + +/* + * Returns whether the stream source has reached EOF. + */ +bool pm_source_stream_eof(const pm_source_t *source); + +#endif diff --git a/prism/static_literals.h b/prism/internal/static_literals.h index bd29761899..d59002ac0a 100644 --- a/prism/static_literals.h +++ b/prism/internal/static_literals.h @@ -1,33 +1,25 @@ -/** - * @file static_literals.h - * - * A set of static literal nodes that can be checked for duplicates. - */ -#ifndef PRISM_STATIC_LITERALS_H -#define PRISM_STATIC_LITERALS_H +#ifndef PRISM_INTERNAL_STATIC_LITERALS_H +#define PRISM_INTERNAL_STATIC_LITERALS_H -#include "prism/defines.h" #include "prism/ast.h" -#include "prism/util/pm_newline_list.h" - -#include <assert.h> -#include <stdbool.h> +#include "prism/buffer.h" +#include "prism/line_offset_list.h" -/** +/* * An internal hash table for a set of nodes. */ typedef struct { - /** The array of nodes in the hash table. */ + /* The array of nodes in the hash table. */ pm_node_t **nodes; - /** The size of the hash table. */ + /* The size of the hash table. */ uint32_t size; - /** The space that has been allocated in the hash table. */ + /* The space that has been allocated in the hash table. */ uint32_t capacity; } pm_node_hash_t; -/** +/* * Certain sets of nodes (hash keys and when clauses) check for duplicate nodes * to alert the user of potential issues. To do this, we keep a set of the nodes * that have been seen so far, and compare whenever we find a new node. @@ -36,86 +28,71 @@ typedef struct { * that need to be performed. */ typedef struct { - /** + /* * This is the set of IntegerNode and SourceLineNode instances. */ pm_node_hash_t integer_nodes; - /** + /* * This is the set of FloatNode instances. */ pm_node_hash_t float_nodes; - /** + /* * This is the set of RationalNode and ImaginaryNode instances. */ pm_node_hash_t number_nodes; - /** + /* * This is the set of StringNode and SourceFileNode instances. */ pm_node_hash_t string_nodes; - /** + /* * This is the set of RegularExpressionNode instances. */ pm_node_hash_t regexp_nodes; - /** + /* * This is the set of SymbolNode instances. */ pm_node_hash_t symbol_nodes; - /** + /* * A pointer to the last TrueNode instance that was inserted, or NULL. */ pm_node_t *true_node; - /** + /* * A pointer to the last FalseNode instance that was inserted, or NULL. */ pm_node_t *false_node; - /** + /* * A pointer to the last NilNode instance that was inserted, or NULL. */ pm_node_t *nil_node; - /** + /* * A pointer to the last SourceEncodingNode instance that was inserted, or * NULL. */ pm_node_t *source_encoding_node; } pm_static_literals_t; -/** +/* * Add a node to the set of static literals. - * - * @param newline_list The list of newline offsets to use to calculate lines. - * @param start_line The line number that the parser starts on. - * @param literals The set of static literals to add the node to. - * @param node The node to add to the set. - * @param replace Whether to replace the previous node if one already exists. - * @return A pointer to the node that is being overwritten, if there is one. */ -pm_node_t * pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace); +pm_node_t * pm_static_literals_add(const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace); -/** +/* * Free the internal memory associated with the given static literals set. - * - * @param literals The set of static literals to free. */ void pm_static_literals_free(pm_static_literals_t *literals); -/** +/* * Create a string-based representation of the given static literal. - * - * @param buffer The buffer to write the string to. - * @param newline_list The list of newline offsets to use to calculate lines. - * @param start_line The line number that the parser starts on. - * @param encoding_name The name of the encoding of the source being parsed. - * @param node The node to create a string representation of. */ -void pm_static_literal_inspect(pm_buffer_t *buffer, const pm_newline_list_t *newline_list, int32_t start_line, const char *encoding_name, const pm_node_t *node); +void pm_static_literal_inspect(pm_buffer_t *buffer, const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, const char *encoding_name, const pm_node_t *node); #endif diff --git a/prism/internal/stringy.h b/prism/internal/stringy.h new file mode 100644 index 0000000000..1aaa23ea75 --- /dev/null +++ b/prism/internal/stringy.h @@ -0,0 +1,30 @@ +#ifndef PRISM_INTERNAL_STRINGY_H +#define PRISM_INTERNAL_STRINGY_H + +#include "prism/stringy.h" + +/* + * Defines an empty string. This is useful for initializing a string that will + * be filled in later. + */ +#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 }) + +/* + * Initialize a shared string that is based on initial input. + */ +void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end); + +/* + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + */ +int pm_string_compare(const pm_string_t *left, const pm_string_t *right); + +/* + * Free the associated memory of the given string. + */ +void pm_string_cleanup(pm_string_t *string); + +#endif diff --git a/prism/util/pm_strncasecmp.h b/prism/internal/strncasecmp.h index 5cb88cb5eb..775f6a993e 100644 --- a/prism/util/pm_strncasecmp.h +++ b/prism/internal/strncasecmp.h @@ -1,18 +1,10 @@ -/** - * @file pm_strncasecmp.h - * - * A custom strncasecmp implementation. - */ -#ifndef PRISM_STRNCASECMP_H -#define PRISM_STRNCASECMP_H +#ifndef PRISM_INTERNAL_STRNCASECMP_H +#define PRISM_INTERNAL_STRNCASECMP_H -#include "prism/defines.h" - -#include <ctype.h> #include <stddef.h> #include <stdint.h> -/** +/* * Compare two strings, ignoring case, up to the given length. Returns 0 if the * strings are equal, a negative number if string1 is less than string2, or a * positive number if string1 is greater than string2. @@ -20,12 +12,6 @@ * Note that this is effectively our own implementation of strncasecmp, but it's * not available on all of the platforms we want to support so we're rolling it * here. - * - * @param string1 The first string to compare. - * @param string2 The second string to compare - * @param length The maximum number of characters to compare. - * @return 0 if the strings are equal, a negative number if string1 is less than - * string2, or a positive number if string1 is greater than string2. */ int pm_strncasecmp(const uint8_t *string1, const uint8_t *string2, size_t length); diff --git a/prism/util/pm_strpbrk.h b/prism/internal/strpbrk.h index f387bd5782..d64156c002 100644 --- a/prism/util/pm_strpbrk.h +++ b/prism/internal/strpbrk.h @@ -1,19 +1,15 @@ -/** - * @file pm_strpbrk.h - * - * A custom strpbrk implementation. - */ -#ifndef PRISM_STRPBRK_H -#define PRISM_STRPBRK_H +#ifndef PRISM_INTERNAL_STRPBRK_H +#define PRISM_INTERNAL_STRPBRK_H -#include "prism/defines.h" -#include "prism/diagnostic.h" #include "prism/parser.h" +/* The maximum number of bytes in a strpbrk charset. */ +#define PM_STRPBRK_CACHE_SIZE 16 + #include <stddef.h> -#include <string.h> +#include <stdint.h> -/** +/* * Here we have rolled our own version of strpbrk. The standard library strpbrk * has undefined behavior when the source string is not null-terminated. We want * to support strings that are not null-terminated because pm_parse does not @@ -31,15 +27,6 @@ * characters that are trailing bytes of multi-byte characters. For example, in * Shift-JIS, the backslash character can be a trailing byte. In that case we * need to take a slower path and iterate one multi-byte character at a time. - * - * @param parser The parser. - * @param source The source to search. - * @param charset The charset to search for. - * @param length The maximum number of bytes to search. - * @param validate Whether to validate that the source string is valid in the - * current encoding of the parser. - * @return A pointer to the first character in the source string that is in the - * charset, or NULL if no such character exists. */ const uint8_t * pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate); diff --git a/prism/internal/tokens.h b/prism/internal/tokens.h new file mode 100644 index 0000000000..3a983e54ae --- /dev/null +++ b/prism/internal/tokens.h @@ -0,0 +1,11 @@ +#ifndef PRISM_INTERNAL_TOKENS_H +#define PRISM_INTERNAL_TOKENS_H + +#include "prism/ast.h" + +/* + * Returns the human name of the given token type. + */ +const char * pm_token_str(pm_token_type_t token_type); + +#endif diff --git a/prism/json.h b/prism/json.h new file mode 100644 index 0000000000..11039e7796 --- /dev/null +++ b/prism/json.h @@ -0,0 +1,32 @@ +/** + * @file json.h + */ +#ifndef PRISM_JSON_H +#define PRISM_JSON_H + +#include "prism/excludes.h" + +/* We optionally support dumping to JSON. For systems that don't want or need + * this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. + */ +#ifndef PRISM_EXCLUDE_JSON + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include "prism/ast.h" +#include "prism/buffer.h" +#include "prism/parser.h" + +/** + * Dump JSON to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param parser The parser that parsed the node. + * @param node The node to serialize. + */ +PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node) PRISM_NONNULL(1, 2, 3); + +#endif + +#endif diff --git a/prism/line_offset_list.c b/prism/line_offset_list.c new file mode 100644 index 0000000000..ce217ebd3f --- /dev/null +++ b/prism/line_offset_list.c @@ -0,0 +1,100 @@ +#include "prism/compiler/align.h" +#include "prism/internal/line_offset_list.h" +#include "prism/internal/arena.h" + +#include <assert.h> +#include <string.h> + +/** + * Initialize a new line offset list with the given capacity. + */ +void +pm_line_offset_list_init(pm_arena_t *arena, pm_line_offset_list_t *list, size_t capacity) { + list->offsets = (uint32_t *) pm_arena_alloc(arena, capacity * sizeof(uint32_t), PRISM_ALIGNOF(uint32_t)); + + // The first line always has offset 0. + list->offsets[0] = 0; + list->size = 1; + list->capacity = capacity; +} + +/** + * Clear out the newlines that have been appended to the list. + */ +void +pm_line_offset_list_clear(pm_line_offset_list_t *list) { + list->size = 1; +} + +/** + * Append a new offset to the newline list (slow path: resize and store). + */ +void +pm_line_offset_list_append_slow(pm_arena_t *arena, pm_line_offset_list_t *list, uint32_t cursor) { + size_t new_capacity = (list->capacity * 3) / 2; + uint32_t *new_offsets = (uint32_t *) pm_arena_alloc(arena, new_capacity * sizeof(uint32_t), PRISM_ALIGNOF(uint32_t)); + + memcpy(new_offsets, list->offsets, list->size * sizeof(uint32_t)); + + list->offsets = new_offsets; + list->capacity = new_capacity; + + assert(list->size == 0 || cursor > list->offsets[list->size - 1]); + list->offsets[list->size++] = cursor; +} + +/** + * Returns the line of the given offset. If the offset is not in the list, the + * line of the closest offset less than the given offset is returned. + */ +int32_t +pm_line_offset_list_line(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line) { + size_t left = 0; + size_t right = list->size - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + + if (list->offsets[mid] == cursor) { + return ((int32_t) mid) + start_line; + } + + if (list->offsets[mid] < cursor) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + return ((int32_t) left) + start_line - 1; +} + +/** + * Returns the line and column of the given offset. If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. + */ +pm_line_column_t +pm_line_offset_list_line_column(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line) { + size_t left = 0; + size_t right = list->size - 1; + + while (left <= right) { + size_t mid = left + (right - left) / 2; + + if (list->offsets[mid] == cursor) { + return ((pm_line_column_t) { ((int32_t) mid) + start_line, 0 }); + } + + if (list->offsets[mid] < cursor) { + left = mid + 1; + } else { + right = mid - 1; + } + } + + return ((pm_line_column_t) { + .line = ((int32_t) left) + start_line - 1, + .column = cursor - list->offsets[left - 1] + }); +} diff --git a/prism/line_offset_list.h b/prism/line_offset_list.h new file mode 100644 index 0000000000..848bc49139 --- /dev/null +++ b/prism/line_offset_list.h @@ -0,0 +1,61 @@ +/** + * @file line_offset_list.h + * + * A list of byte offsets of newlines in a string. + * + * When compiling the syntax tree, it's necessary to know the line and column + * of many nodes. This is necessary to support things like error messages, + * tracepoints, etc. + * + * It's possible that we could store the start line, start column, end line, and + * end column on every node in addition to the offsets that we already store, + * but that would be quite a lot of memory overhead. + */ +#ifndef PRISM_LINE_OFFSET_LIST_H +#define PRISM_LINE_OFFSET_LIST_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> +#include <stdint.h> + +/** + * A list of offsets of the start of lines in a string. The offsets are assumed + * to be sorted/inserted in ascending order. + */ +typedef struct { + /** The number of offsets in the list. */ + size_t size; + + /** The capacity of the list that has been allocated. */ + size_t capacity; + + /** The list of offsets. */ + uint32_t *offsets; +} pm_line_offset_list_t; + +/** + * A line and column in a string. + */ +typedef struct { + /** The line number. */ + int32_t line; + + /** The column in bytes. */ + uint32_t column; +} pm_line_column_t; + +/** + * Returns the line and column of the given offset. If the offset is not in the + * list, the line and column of the closest offset less than the given offset + * are returned. + * + * @param list The list to search. + * @param cursor The offset to search for. + * @param start_line The line to start counting from. + * @returns The line and column of the given offset. + */ +PRISM_EXPORTED_FUNCTION pm_line_column_t pm_line_offset_list_line_column(const pm_line_offset_list_t *list, uint32_t cursor, int32_t start_line) PRISM_NONNULL(1); + +#endif diff --git a/prism/list.c b/prism/list.c new file mode 100644 index 0000000000..8d4cd1be94 --- /dev/null +++ b/prism/list.c @@ -0,0 +1,24 @@ +#include "prism/internal/list.h" + +/** + * Returns the size of the list. + */ +size_t +pm_list_size(pm_list_t *list) { + return list->size; +} + +/** + * Append a node to the given list. + */ +void +pm_list_append(pm_list_t *list, pm_list_node_t *node) { + if (list->head == NULL) { + list->head = node; + } else { + list->tail->next = node; + } + + list->tail = node; + list->size++; +} diff --git a/prism/magic_comments.h b/prism/magic_comments.h new file mode 100644 index 0000000000..c9d6b600e8 --- /dev/null +++ b/prism/magic_comments.h @@ -0,0 +1,35 @@ +/** + * @file magic_comments.h + * + * Types and functions related to magic comments found during parsing. + */ +#ifndef PRISM_MAGIC_COMMENTS_H +#define PRISM_MAGIC_COMMENTS_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include "prism/ast.h" + +#include <stddef.h> + +/** An opaque pointer to a magic comment found while parsing. */ +typedef struct pm_magic_comment_t pm_magic_comment_t; + +/** + * Returns the location of the key associated with the given magic comment. + * + * @param magic_comment the magic comment whose key location we want to get + * @returns the location of the key associated with the given magic comment + */ +PRISM_EXPORTED_FUNCTION pm_location_t pm_magic_comment_key(const pm_magic_comment_t *magic_comment) PRISM_NONNULL(1); + +/** + * Returns the location of the value associated with the given magic comment. + * + * @param magic_comment the magic comment whose value location we want to get + * @returns the location of the value associated with the given magic comment + */ +PRISM_EXPORTED_FUNCTION pm_location_t pm_magic_comment_value(const pm_magic_comment_t *magic_comment) PRISM_NONNULL(1); + +#endif diff --git a/prism/util/pm_memchr.c b/prism/memchr.c index 7ea20ace6d..900e6245b7 100644 --- a/prism/util/pm_memchr.c +++ b/prism/memchr.c @@ -1,15 +1,19 @@ -#include "prism/util/pm_memchr.h" +#include "prism/internal/memchr.h" -#define PRISM_MEMCHR_TRAILING_BYTE_MINIMUM 0x40 +#include <stdbool.h> +#include <stdint.h> +#include <string.h> + +#define TRAILING_BYTE_MINIMUM 0x40 /** * We need to roll our own memchr to handle cases where the encoding changes and * we need to search for a character in a buffer that could be the trailing byte * of a multibyte character. */ -void * +const void * pm_memchr(const void *memory, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding) { - if (encoding_changed && encoding->multibyte && character >= PRISM_MEMCHR_TRAILING_BYTE_MINIMUM) { + if (encoding_changed && encoding->multibyte && character >= TRAILING_BYTE_MINIMUM) { const uint8_t *source = (const uint8_t *) memory; size_t index = 0; @@ -31,5 +35,3 @@ pm_memchr(const void *memory, int character, size_t number, bool encoding_change return memchr(memory, character, number); } } - -#undef PRISM_MEMCHR_TRAILING_BYTE_MINIMUM diff --git a/prism/node.h b/prism/node.h index e8686a327c..75bc3c9b2d 100644 --- a/prism/node.h +++ b/prism/node.h @@ -6,9 +6,10 @@ #ifndef PRISM_NODE_H #define PRISM_NODE_H -#include "prism/defines.h" -#include "prism/parser.h" -#include "prism/util/pm_buffer.h" +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include "prism/ast.h" /** * Loop through each node in the node list, writing each node to the given @@ -18,51 +19,12 @@ for (size_t index = 0; index < (list)->size && ((node) = (list)->nodes[index]); index++) /** - * Append a new node onto the end of the node list. - * - * @param list The list to append to. - * @param node The node to append. - */ -void pm_node_list_append(pm_node_list_t *list, pm_node_t *node); - -/** - * Prepend a new node onto the beginning of the node list. - * - * @param list The list to prepend to. - * @param node The node to prepend. - */ -void pm_node_list_prepend(pm_node_list_t *list, pm_node_t *node); - -/** - * Concatenate the given node list onto the end of the other node list. - * - * @param list The list to concatenate onto. - * @param other The list to concatenate. - */ -void pm_node_list_concat(pm_node_list_t *list, pm_node_list_t *other); - -/** - * Free the internal memory associated with the given node list. - * - * @param list The list to free. - */ -void pm_node_list_free(pm_node_list_t *list); - -/** - * Deallocate a node and all of its children. - * - * @param parser The parser that owns the node. - * @param node The node to deallocate. - */ -PRISM_EXPORTED_FUNCTION void pm_node_destroy(pm_parser_t *parser, struct pm_node *node); - -/** * Returns a string representation of the given node type. * * @param node_type The node type to convert to a string. - * @return A string representation of the given node type. + * @returns A string representation of the given node type. */ -PRISM_EXPORTED_FUNCTION const char * pm_node_type_to_str(pm_node_type_t node_type); +PRISM_EXPORTED_FUNCTION const char * pm_node_type(pm_node_type_t node_type); /** * Visit each of the nodes in this subtree using the given visitor callback. The @@ -80,7 +42,7 @@ PRISM_EXPORTED_FUNCTION const char * pm_node_type_to_str(pm_node_type_t node_typ * bool visit(const pm_node_t *node, void *data) { * size_t *indent = (size_t *) data; * for (size_t i = 0; i < *indent * 2; i++) putc(' ', stdout); - * printf("%s\n", pm_node_type_to_str(node->type)); + * printf("%s\n", pm_node_type(node->type)); * * size_t next_indent = *indent + 1; * size_t *next_data = &next_indent; @@ -93,18 +55,21 @@ PRISM_EXPORTED_FUNCTION const char * pm_node_type_to_str(pm_node_type_t node_typ * const char *source = "1 + 2; 3 + 4"; * size_t size = strlen(source); * - * pm_parser_t parser; - * pm_options_t options = { 0 }; - * pm_parser_init(&parser, (const uint8_t *) source, size, &options); + * pm_arena_t *arena = pm_arena_new(); + * pm_options_t *options = pm_options_new(); + * + * pm_parser_t *parser = pm_parser_new(arena, (const uint8_t *) source, size, options); * * size_t indent = 0; - * pm_node_t *node = pm_parse(&parser); + * pm_node_t *node = pm_parse(parser); * * size_t *data = &indent; * pm_visit_node(node, visit, data); * - * pm_node_destroy(&parser, node); - * pm_parser_free(&parser); + * pm_parser_free(parser); + * pm_options_free(options); + * pm_arena_free(arena); + * * return EXIT_SUCCESS; * } * ``` @@ -113,7 +78,7 @@ PRISM_EXPORTED_FUNCTION const char * pm_node_type_to_str(pm_node_type_t node_typ * @param visitor The callback to call for each node in the subtree. * @param data An opaque pointer that is passed to the visitor callback. */ -PRISM_EXPORTED_FUNCTION void pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data); +PRISM_EXPORTED_FUNCTION void pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) PRISM_NONNULL(1); /** * Visit the children of the given node with the given callback. This is the @@ -124,6 +89,6 @@ PRISM_EXPORTED_FUNCTION void pm_visit_node(const pm_node_t *node, bool (*visitor * @param visitor The callback to call for each child node. * @param data An opaque pointer that is passed to the visitor callback. */ -PRISM_EXPORTED_FUNCTION void pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data); +PRISM_EXPORTED_FUNCTION void pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) PRISM_NONNULL(1); #endif diff --git a/prism/options.c b/prism/options.c index a457178ce8..b589865a2a 100644 --- a/prism/options.c +++ b/prism/options.c @@ -1,18 +1,78 @@ -#include "prism/options.h" +#include "prism/internal/options.h" + +#include "prism/compiler/inline.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/char.h" +#include "prism/internal/stringy.h" + +#include <stdlib.h> +#include <string.h> + +/** + * Allocate a new options struct. If the options struct cannot be allocated, + * this function aborts the process. + */ +pm_options_t * +pm_options_new(void) { + pm_options_t *options = xcalloc(1, sizeof(pm_options_t)); + if (options == NULL) abort(); + return options; +} + +/** + * Free the internal memory associated with the options. + */ +void +pm_options_cleanup(pm_options_t *options) { + pm_string_cleanup(&options->filepath); + pm_string_cleanup(&options->encoding); + + for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) { + pm_options_scope_t *scope = &options->scopes[scope_index]; + + for (size_t local_index = 0; local_index < scope->locals_count; local_index++) { + pm_string_cleanup(&scope->locals[local_index]); + } + + xfree_sized(scope->locals, scope->locals_count * sizeof(pm_string_t)); + } + + xfree_sized(options->scopes, options->scopes_count * sizeof(pm_options_scope_t)); +} + +/** + * Free both the held memory of the given options struct and the struct itself. + * + * @param options The options struct to free. + */ +void +pm_options_free(pm_options_t *options) { + pm_options_cleanup(options); + xfree_sized(options, sizeof(pm_options_t)); +} /** * Set the shebang callback option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_shebang_callback_set(pm_options_t *options, pm_options_shebang_callback_t shebang_callback, void *shebang_callback_data) { options->shebang_callback = shebang_callback; options->shebang_callback_data = shebang_callback_data; } /** + * Get the filepath option on the given options struct. + */ +const pm_string_t * +pm_options_filepath(const pm_options_t *options) { + return &options->filepath; +} + +/** * Set the filepath option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_filepath_set(pm_options_t *options, const char *filepath) { pm_string_constant_init(&options->filepath, filepath, strlen(filepath)); } @@ -20,7 +80,7 @@ pm_options_filepath_set(pm_options_t *options, const char *filepath) { /** * Set the encoding option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_encoding_set(pm_options_t *options, const char *encoding) { pm_string_constant_init(&options->encoding, encoding, strlen(encoding)); } @@ -28,7 +88,7 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) { /** * Set the encoding_locked option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) { options->encoding_locked = encoding_locked; } @@ -36,7 +96,7 @@ pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) { /** * Set the line option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_line_set(pm_options_t *options, int32_t line) { options->line = line; } @@ -44,7 +104,7 @@ pm_options_line_set(pm_options_t *options, int32_t line) { /** * Set the frozen string literal option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_literal) { options->frozen_string_literal = frozen_string_literal ? PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED : PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED; } @@ -52,7 +112,7 @@ pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_l /** * Sets the command line option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_command_line_set(pm_options_t *options, uint8_t command_line) { options->command_line = command_line; } @@ -60,7 +120,7 @@ pm_options_command_line_set(pm_options_t *options, uint8_t command_line) { /** * Checks if the given slice represents a number. */ -static inline bool +static PRISM_INLINE bool is_number(const char *string, size_t length) { return pm_strspn_decimal_digit((const uint8_t *) string, (ptrdiff_t) length) == length; } @@ -70,7 +130,7 @@ is_number(const char *string, size_t length) { * string. If the string contains an invalid option, this returns false. * Otherwise, it returns true. */ -PRISM_EXPORTED_FUNCTION bool +bool pm_options_version_set(pm_options_t *options, const char *version, size_t length) { if (version == NULL) { options->version = PM_OPTIONS_VERSION_LATEST; @@ -88,33 +148,43 @@ pm_options_version_set(pm_options_t *options, const char *version, size_t length return true; } - if (strncmp(version, "3.5", 3) == 0) { - options->version = PM_OPTIONS_VERSION_LATEST; + if (strncmp(version, "3.5", 3) == 0 || strncmp(version, "4.0", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_0; + return true; + } + + if (strncmp(version, "4.1", 3) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_1; return true; } return false; } - if (length >= 4) { - if (strncmp(version, "3.3.", 4) == 0 && is_number(version + 4, length - 4)) { + if (length >= 4 && is_number(version + 4, length - 4)) { + if (strncmp(version, "3.3.", 4) == 0) { options->version = PM_OPTIONS_VERSION_CRUBY_3_3; return true; } - if (strncmp(version, "3.4.", 4) == 0 && is_number(version + 4, length - 4)) { + if (strncmp(version, "3.4.", 4) == 0) { options->version = PM_OPTIONS_VERSION_CRUBY_3_4; return true; } - if (strncmp(version, "3.5.", 4) == 0 && is_number(version + 4, length - 4)) { - options->version = PM_OPTIONS_VERSION_LATEST; + if (strncmp(version, "3.5.", 4) == 0 || strncmp(version, "4.0.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_0; + return true; + } + + if (strncmp(version, "4.1.", 4) == 0) { + options->version = PM_OPTIONS_VERSION_CRUBY_4_1; return true; } } - if (length >= 6) { - if (strncmp(version, "latest", 7) == 0) { // 7 to compare the \0 as well + if (length == 6) { + if (strncmp(version, "latest", 6) == 0) { options->version = PM_OPTIONS_VERSION_LATEST; return true; } @@ -124,9 +194,27 @@ pm_options_version_set(pm_options_t *options, const char *version, size_t length } /** + * Set the version option on the given options struct to the lowest version of + * Ruby that prism supports. + */ +void +pm_options_version_set_lowest(pm_options_t *options) { + options->version = PM_OPTIONS_VERSION_CRUBY_3_3; +} + +/** + * Set the version option on the given options struct to the highest version of + * Ruby that prism supports. + */ +void +pm_options_version_set_highest(pm_options_t *options) { + options->version = PM_OPTIONS_VERSION_LATEST; +} + +/** * Set the main script option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_main_script_set(pm_options_t *options, bool main_script) { options->main_script = main_script; } @@ -134,15 +222,23 @@ pm_options_main_script_set(pm_options_t *options, bool main_script) { /** * Set the partial script option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_partial_script_set(pm_options_t *options, bool partial_script) { options->partial_script = partial_script; } /** + * Get the freeze option on the given options struct. + */ +bool +pm_options_freeze(const pm_options_t *options) { + return options->freeze; +} + +/** * Set the freeze option on the given options struct. */ -PRISM_EXPORTED_FUNCTION void +void pm_options_freeze_set(pm_options_t *options, bool freeze) { options->freeze = freeze; } @@ -158,7 +254,7 @@ pm_options_freeze_set(pm_options_t *options, bool freeze) { /** * Allocate and zero out the scopes array on the given options struct. */ -PRISM_EXPORTED_FUNCTION bool +bool pm_options_scopes_init(pm_options_t *options, size_t scopes_count) { options->scopes_count = scopes_count; options->scopes = xcalloc(scopes_count, sizeof(pm_options_scope_t)); @@ -166,10 +262,20 @@ pm_options_scopes_init(pm_options_t *options, size_t scopes_count) { } /** - * Return a pointer to the scope at the given index within the given options. + * Return a constant pointer to the scope at the given index within the given + * options. */ -PRISM_EXPORTED_FUNCTION const pm_options_scope_t * -pm_options_scope_get(const pm_options_t *options, size_t index) { +const pm_options_scope_t * +pm_options_scope(const pm_options_t *options, size_t index) { + return &options->scopes[index]; +} + +/** + * Return a mutable pointer to the scope at the given index within the given + * options. + */ +pm_options_scope_t * +pm_options_scope_mut(pm_options_t *options, size_t index) { return &options->scopes[index]; } @@ -177,49 +283,38 @@ pm_options_scope_get(const pm_options_t *options, size_t index) { * Create a new options scope struct. This will hold a set of locals that are in * scope surrounding the code that is being parsed. */ -PRISM_EXPORTED_FUNCTION bool +void pm_options_scope_init(pm_options_scope_t *scope, size_t locals_count) { scope->locals_count = locals_count; scope->locals = xcalloc(locals_count, sizeof(pm_string_t)); scope->forwarding = PM_OPTIONS_SCOPE_FORWARDING_NONE; - return scope->locals != NULL; + if (scope->locals == NULL) abort(); } /** - * Return a pointer to the local at the given index within the given scope. + * Return a constant pointer to the local at the given index within the given + * scope. */ -PRISM_EXPORTED_FUNCTION const pm_string_t * -pm_options_scope_local_get(const pm_options_scope_t *scope, size_t index) { +const pm_string_t * +pm_options_scope_local(const pm_options_scope_t *scope, size_t index) { return &scope->locals[index]; } /** - * Set the forwarding option on the given scope struct. + * Return a mutable pointer to the local at the given index within the given + * scope. */ -PRISM_EXPORTED_FUNCTION void -pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding) { - scope->forwarding = forwarding; +pm_string_t * +pm_options_scope_local_mut(pm_options_scope_t *scope, size_t index) { + return &scope->locals[index]; } /** - * Free the internal memory associated with the options. + * Set the forwarding option on the given scope struct. */ -PRISM_EXPORTED_FUNCTION void -pm_options_free(pm_options_t *options) { - pm_string_free(&options->filepath); - pm_string_free(&options->encoding); - - for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) { - pm_options_scope_t *scope = &options->scopes[scope_index]; - - for (size_t local_index = 0; local_index < scope->locals_count; local_index++) { - pm_string_free(&scope->locals[local_index]); - } - - xfree(scope->locals); - } - - xfree(options->scopes); +void +pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding) { + scope->forwarding = forwarding; } /** @@ -304,10 +399,7 @@ pm_options_read(pm_options_t *options, const char *data) { data += 4; pm_options_scope_t *scope = &options->scopes[scope_index]; - if (!pm_options_scope_init(scope, locals_count)) { - pm_options_free(options); - return; - } + pm_options_scope_init(scope, locals_count); uint8_t forwarding = (uint8_t) *data++; pm_options_scope_forwarding_set(&options->scopes[scope_index], forwarding); diff --git a/prism/options.h b/prism/options.h index 2f64701b0c..0f5d7529b1 100644 --- a/prism/options.h +++ b/prism/options.h @@ -6,16 +6,27 @@ #ifndef PRISM_OPTIONS_H #define PRISM_OPTIONS_H -#include "prism/defines.h" -#include "prism/util/pm_char.h" -#include "prism/util/pm_string.h" +#include "prism/compiler/exported.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include "prism/stringy.h" #include <stdbool.h> #include <stddef.h> -#include <stdint.h> /** - * String literals should be made frozen. + * A scope of locals surrounding the code that is being parsed. + */ +typedef struct pm_options_scope_t pm_options_scope_t; + +/** + * The options that can be passed to the parser. + */ +typedef struct pm_options_t pm_options_t; + +/** + * String literals should not be frozen. */ #define PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED ((int8_t) -1) @@ -26,42 +37,25 @@ #define PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET ((int8_t) 0) /** - * String literals should be made mutable. + * String literals should be made frozen. */ #define PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED ((int8_t) 1) -/** - * A scope of locals surrounding the code that is being parsed. - */ -typedef struct pm_options_scope { - /** The number of locals in the scope. */ - size_t locals_count; - - /** The names of the locals in the scope. */ - pm_string_t *locals; - - /** Flags for the set of forwarding parameters in this scope. */ - uint8_t forwarding; -} pm_options_scope_t; - /** The default value for parameters. */ static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_NONE = 0x0; -/** When the scope is fowarding with the * parameter. */ +/** When the scope is forwarding with the * parameter. */ static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_POSITIONALS = 0x1; -/** When the scope is fowarding with the ** parameter. */ +/** When the scope is forwarding with the ** parameter. */ static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_KEYWORDS = 0x2; -/** When the scope is fowarding with the & parameter. */ +/** When the scope is forwarding with the & parameter. */ static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_BLOCK = 0x4; -/** When the scope is fowarding with the ... parameter. */ +/** When the scope is forwarding with the ... parameter. */ static const uint8_t PM_OPTIONS_SCOPE_FORWARDING_ALL = 0x8; -// Forward declaration needed by the callback typedef. -struct pm_options; - /** * The callback called when additional switches are found in a shebang comment * that need to be processed by the runtime. @@ -74,118 +68,7 @@ struct pm_options; * @param shebang_callback_data Any additional data that should be passed along * to the callback. */ -typedef void (*pm_options_shebang_callback_t)(struct pm_options *options, const uint8_t *source, size_t length, void *shebang_callback_data); - -/** - * The version of Ruby syntax that we should be parsing with. This is used to - * allow consumers to specify which behavior they want in case they need to - * parse in the same way as a specific version of CRuby would have. - */ -typedef enum { - /** The current version of prism. */ - PM_OPTIONS_VERSION_LATEST = 0, - - /** The vendored version of prism in CRuby 3.3.x. */ - PM_OPTIONS_VERSION_CRUBY_3_3 = 1, - - /** The vendored version of prism in CRuby 3.4.x. */ - PM_OPTIONS_VERSION_CRUBY_3_4 = 2 -} pm_options_version_t; - -/** - * The options that can be passed to the parser. - */ -typedef struct pm_options { - /** - * The callback to call when additional switches are found in a shebang - * comment. - */ - pm_options_shebang_callback_t shebang_callback; - - /** - * Any additional data that should be passed along to the shebang callback - * if one was set. - */ - void *shebang_callback_data; - - /** The name of the file that is currently being parsed. */ - pm_string_t filepath; - - /** - * The line within the file that the parse starts on. This value is - * 1-indexed. - */ - int32_t line; - - /** - * The name of the encoding that the source file is in. Note that this must - * correspond to a name that can be found with Encoding.find in Ruby. - */ - pm_string_t encoding; - - /** - * The number of scopes surrounding the code that is being parsed. - */ - size_t scopes_count; - - /** - * The scopes surrounding the code that is being parsed. For most parses - * this will be NULL, but for evals it will be the locals that are in scope - * surrounding the eval. Scopes are ordered from the outermost scope to the - * innermost one. - */ - pm_options_scope_t *scopes; - - /** - * The version of prism that we should be parsing with. This is used to - * allow consumers to specify which behavior they want in case they need to - * parse exactly as a specific version of CRuby. - */ - pm_options_version_t version; - - /** A bitset of the various options that were set on the command line. */ - uint8_t command_line; - - /** - * Whether or not the frozen string literal option has been set. - * May be: - * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET - */ - int8_t frozen_string_literal; - - /** - * Whether or not the encoding magic comments should be respected. This is a - * niche use-case where you want to parse a file with a specific encoding - * but ignore any encoding magic comments at the top of the file. - */ - bool encoding_locked; - - /** - * When the file being parsed is the main script, the shebang will be - * considered for command-line flags (or for implicit -x). The caller needs - * to pass this information to the parser so that it can behave correctly. - */ - bool main_script; - - /** - * When the file being parsed is considered a "partial" script, jumps will - * not be marked as errors if they are not contained within loops/blocks. - * This is used in the case that you're parsing a script that you know will - * be embedded inside another script later, but you do not have that context - * yet. For example, when parsing an ERB template that will be evaluated - * inside another script. - */ - bool partial_script; - - /** - * Whether or not the parser should freeze the nodes that it creates. This - * makes it possible to have a deeply frozen AST that is safe to share - * between concurrency primitives. - */ - bool freeze; -} pm_options_t; +typedef void (*pm_options_shebang_callback_t)(pm_options_t *options, const uint8_t *source, size_t length, void *shebang_callback_data); /** * A bit representing whether or not the command line -a option was set. -a @@ -220,11 +103,27 @@ static const uint8_t PM_OPTIONS_COMMAND_LINE_P = 0x10; /** * A bit representing whether or not the command line -x option was set. -x - * searches the input file for a shebang that matches the current Ruby engine. + * searches the input file for a shebang that includes "ruby". */ static const uint8_t PM_OPTIONS_COMMAND_LINE_X = 0x20; /** + * Allocate a new options struct. If the options struct cannot be allocated, + * this function aborts the process. + * + * @returns A new options struct with default values. It is the responsibility + * of the caller to free this struct using pm_options_free(). + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_options_t * pm_options_new(void); + +/** + * Free both the held memory of the given options struct and the struct itself. + * + * @param options The options struct to free. + */ +PRISM_EXPORTED_FUNCTION void pm_options_free(pm_options_t *options) PRISM_NONNULL(1); + +/** * Set the shebang callback option on the given options struct. * * @param options The options struct to set the shebang callback on. @@ -232,7 +131,15 @@ static const uint8_t PM_OPTIONS_COMMAND_LINE_X = 0x20; * @param shebang_callback_data Any additional data that should be passed along * to the callback. */ -PRISM_EXPORTED_FUNCTION void pm_options_shebang_callback_set(pm_options_t *options, pm_options_shebang_callback_t shebang_callback, void *shebang_callback_data); +PRISM_EXPORTED_FUNCTION void pm_options_shebang_callback_set(pm_options_t *options, pm_options_shebang_callback_t shebang_callback, void *shebang_callback_data) PRISM_NONNULL(1); + +/** + * Get the filepath option on the given options struct. + * + * @param options The options struct to get the filepath from. + * @returns The filepath. + */ +PRISM_EXPORTED_FUNCTION const pm_string_t * pm_options_filepath(const pm_options_t *options) PRISM_NONNULL(1); /** * Set the filepath option on the given options struct. @@ -240,7 +147,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_shebang_callback_set(pm_options_t *optio * @param options The options struct to set the filepath on. * @param filepath The filepath to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_filepath_set(pm_options_t *options, const char *filepath); +PRISM_EXPORTED_FUNCTION void pm_options_filepath_set(pm_options_t *options, const char *filepath) PRISM_NONNULL(1); /** * Set the line option on the given options struct. @@ -248,7 +155,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_filepath_set(pm_options_t *options, cons * @param options The options struct to set the line on. * @param line The line to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t line); +PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t line) PRISM_NONNULL(1); /** * Set the encoding option on the given options struct. @@ -256,7 +163,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t * @param options The options struct to set the encoding on. * @param encoding The encoding to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding); +PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding) PRISM_NONNULL(1); /** * Set the encoding_locked option on the given options struct. @@ -264,7 +171,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, cons * @param options The options struct to set the encoding_locked value on. * @param encoding_locked The encoding_locked value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked); +PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) PRISM_NONNULL(1); /** * Set the frozen string literal option on the given options struct. @@ -272,7 +179,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *option * @param options The options struct to set the frozen string literal value on. * @param frozen_string_literal The frozen string literal value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_literal); +PRISM_EXPORTED_FUNCTION void pm_options_frozen_string_literal_set(pm_options_t *options, bool frozen_string_literal) PRISM_NONNULL(1); /** * Sets the command line option on the given options struct. @@ -280,7 +187,7 @@ PRISM_EXPORTED_FUNCTION void pm_options_frozen_string_literal_set(pm_options_t * * @param options The options struct to set the command line option on. * @param command_line The command_line value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_command_line_set(pm_options_t *options, uint8_t command_line); +PRISM_EXPORTED_FUNCTION void pm_options_command_line_set(pm_options_t *options, uint8_t command_line) PRISM_NONNULL(1); /** * Set the version option on the given options struct by parsing the given @@ -290,9 +197,25 @@ PRISM_EXPORTED_FUNCTION void pm_options_command_line_set(pm_options_t *options, * @param options The options struct to set the version on. * @param version The version to set. * @param length The length of the version string. - * @return Whether or not the version was parsed successfully. + * @returns Whether or not the version was parsed successfully. */ -PRISM_EXPORTED_FUNCTION bool pm_options_version_set(pm_options_t *options, const char *version, size_t length); +PRISM_EXPORTED_FUNCTION bool pm_options_version_set(pm_options_t *options, const char *version, size_t length) PRISM_NONNULL(1); + +/** + * Set the version option on the given options struct to the lowest version of + * Ruby that prism supports. + * + * @param options The options struct to set the version on. + */ +PRISM_EXPORTED_FUNCTION void pm_options_version_set_lowest(pm_options_t *options) PRISM_NONNULL(1); + +/** + * Set the version option on the given options struct to the highest version of + * Ruby that prism supports. + * + * @param options The options struct to set the version on. + */ +PRISM_EXPORTED_FUNCTION void pm_options_version_set_highest(pm_options_t *options) PRISM_NONNULL(1); /** * Set the main script option on the given options struct. @@ -300,7 +223,7 @@ PRISM_EXPORTED_FUNCTION bool pm_options_version_set(pm_options_t *options, const * @param options The options struct to set the main script value on. * @param main_script The main script value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_main_script_set(pm_options_t *options, bool main_script); +PRISM_EXPORTED_FUNCTION void pm_options_main_script_set(pm_options_t *options, bool main_script) PRISM_NONNULL(1); /** * Set the partial script option on the given options struct. @@ -308,7 +231,15 @@ PRISM_EXPORTED_FUNCTION void pm_options_main_script_set(pm_options_t *options, b * @param options The options struct to set the partial script value on. * @param partial_script The partial script value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_partial_script_set(pm_options_t *options, bool partial_script); +PRISM_EXPORTED_FUNCTION void pm_options_partial_script_set(pm_options_t *options, bool partial_script) PRISM_NONNULL(1); + +/** + * Get the freeze option on the given options struct. + * + * @param options The options struct to get the freeze value from. + * @returns The freeze value. + */ +PRISM_EXPORTED_FUNCTION bool pm_options_freeze(const pm_options_t *options) PRISM_NONNULL(1); /** * Set the freeze option on the given options struct. @@ -316,127 +247,73 @@ PRISM_EXPORTED_FUNCTION void pm_options_partial_script_set(pm_options_t *options * @param options The options struct to set the freeze value on. * @param freeze The freeze value to set. */ -PRISM_EXPORTED_FUNCTION void pm_options_freeze_set(pm_options_t *options, bool freeze); +PRISM_EXPORTED_FUNCTION void pm_options_freeze_set(pm_options_t *options, bool freeze) PRISM_NONNULL(1); /** * Allocate and zero out the scopes array on the given options struct. * * @param options The options struct to initialize the scopes array on. * @param scopes_count The number of scopes to allocate. - * @return Whether or not the scopes array was initialized successfully. + * @returns Whether or not the scopes array was initialized successfully. + */ +PRISM_EXPORTED_FUNCTION bool pm_options_scopes_init(pm_options_t *options, size_t scopes_count) PRISM_NONNULL(1); + +/** + * Return a constant pointer to the scope at the given index within the given + * options. + * + * @param options The options struct to get the scope from. + * @param index The index of the scope to get. + * @returns A constant pointer to the scope at the given index. */ -PRISM_EXPORTED_FUNCTION bool pm_options_scopes_init(pm_options_t *options, size_t scopes_count); +PRISM_EXPORTED_FUNCTION const pm_options_scope_t * pm_options_scope(const pm_options_t *options, size_t index) PRISM_NONNULL(1); /** - * Return a pointer to the scope at the given index within the given options. + * Return a mutable pointer to the scope at the given index within the given + * options. * * @param options The options struct to get the scope from. * @param index The index of the scope to get. - * @return A pointer to the scope at the given index. + * @returns A mutable pointer to the scope at the given index. */ -PRISM_EXPORTED_FUNCTION const pm_options_scope_t * pm_options_scope_get(const pm_options_t *options, size_t index); +PRISM_EXPORTED_FUNCTION pm_options_scope_t * pm_options_scope_mut(pm_options_t *options, size_t index) PRISM_NONNULL(1); /** * Create a new options scope struct. This will hold a set of locals that are in - * scope surrounding the code that is being parsed. + * scope surrounding the code that is being parsed. If the scope was unable to + * allocate its locals, this function will abort the process. * * @param scope The scope struct to initialize. * @param locals_count The number of locals to allocate. - * @return Whether or not the scope was initialized successfully. */ -PRISM_EXPORTED_FUNCTION bool pm_options_scope_init(pm_options_scope_t *scope, size_t locals_count); +PRISM_EXPORTED_FUNCTION void pm_options_scope_init(pm_options_scope_t *scope, size_t locals_count) PRISM_NONNULL(1); /** - * Return a pointer to the local at the given index within the given scope. + * Return a constant pointer to the local at the given index within the given + * scope. * * @param scope The scope struct to get the local from. * @param index The index of the local to get. - * @return A pointer to the local at the given index. - */ -PRISM_EXPORTED_FUNCTION const pm_string_t * pm_options_scope_local_get(const pm_options_scope_t *scope, size_t index); - -/** - * Set the forwarding option on the given scope struct. - * - * @param scope The scope struct to set the forwarding on. - * @param forwarding The forwarding value to set. + * @returns A constant pointer to the local at the given index. */ -PRISM_EXPORTED_FUNCTION void pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding); +PRISM_EXPORTED_FUNCTION const pm_string_t * pm_options_scope_local(const pm_options_scope_t *scope, size_t index) PRISM_NONNULL(1); /** - * Free the internal memory associated with the options. + * Return a mutable pointer to the local at the given index within the given + * scope. * - * @param options The options struct whose internal memory should be freed. + * @param scope The scope struct to get the local from. + * @param index The index of the local to get. + * @returns A mutable pointer to the local at the given index. */ -PRISM_EXPORTED_FUNCTION void pm_options_free(pm_options_t *options); +PRISM_EXPORTED_FUNCTION pm_string_t * pm_options_scope_local_mut(pm_options_scope_t *scope, size_t index) PRISM_NONNULL(1); /** - * Deserialize an options struct from the given binary string. This is used to - * pass options to the parser from an FFI call so that consumers of the library - * from an FFI perspective don't have to worry about the structure of our - * options structs. Since the source of these calls will be from Ruby - * implementation internals we assume it is from a trusted source. - * - * `data` is assumed to be a valid pointer pointing to well-formed data. The - * layout of this data should be the same every time, and is described below: - * - * | # bytes | field | - * | ------- | -------------------------- | - * | `4` | the length of the filepath | - * | ... | the filepath bytes | - * | `4` | the line number | - * | `4` | the length the encoding | - * | ... | the encoding bytes | - * | `1` | frozen string literal | - * | `1` | -p command line option | - * | `1` | -n command line option | - * | `1` | -l command line option | - * | `1` | -a command line option | - * | `1` | the version | - * | `1` | encoding locked | - * | `1` | main script | - * | `1` | partial script | - * | `1` | freeze | - * | `4` | the number of scopes | - * | ... | the scopes | - * - * The version field is an enum, so it should be one of the following values: - * - * | value | version | - * | ----- | ------------------------- | - * | `0` | use the latest version of prism | - * | `1` | use the version of prism that is vendored in CRuby 3.3.0 | - * - * Each scope is laid out as follows: - * - * | # bytes | field | - * | ------- | -------------------------- | - * | `4` | the number of locals | - * | `1` | the forwarding flags | - * | ... | the locals | - * - * Each local is laid out as follows: - * - * | # bytes | field | - * | ------- | -------------------------- | - * | `4` | the length of the local | - * | ... | the local bytes | - * - * Some additional things to note about this layout: - * - * * The filepath can have a length of 0, in which case we'll consider it an - * empty string. - * * The line number should be 0-indexed. - * * The encoding can have a length of 0, in which case we'll use the default - * encoding (UTF-8). If it's not 0, it should correspond to a name of an - * encoding that can be passed to `Encoding.find` in Ruby. - * * The frozen string literal, encoding locked, main script, and partial script - * fields are booleans, so their values should be either 0 or 1. - * * The number of scopes can be 0. + * Set the forwarding option on the given scope struct. * - * @param options The options struct to deserialize into. - * @param data The binary string to deserialize from. + * @param scope The scope struct to set the forwarding on. + * @param forwarding The forwarding value to set. */ -void pm_options_read(pm_options_t *options, const char *data); +PRISM_EXPORTED_FUNCTION void pm_options_scope_forwarding_set(pm_options_scope_t *scope, uint8_t forwarding) PRISM_NONNULL(1); #endif diff --git a/prism/pack.c b/prism/pack.c deleted file mode 100644 index 1388ca8a3b..0000000000 --- a/prism/pack.c +++ /dev/null @@ -1,509 +0,0 @@ -#include "prism/pack.h" - -// We optionally support parsing String#pack templates. For systems that don't -// want or need this functionality, it can be turned off with the -// PRISM_EXCLUDE_PACK define. -#ifdef PRISM_EXCLUDE_PACK - -void pm_pack_parse(void) {} - -#else - -#include <stdbool.h> -#include <errno.h> - -static uintmax_t -strtoumaxc(const char **format) { - uintmax_t value = 0; - while (**format >= '0' && **format <= '9') { - if (value > UINTMAX_MAX / 10) { - errno = ERANGE; - } - value = value * 10 + ((uintmax_t) (**format - '0')); - (*format)++; - } - return value; -} - -PRISM_EXPORTED_FUNCTION pm_pack_result -pm_pack_parse( - pm_pack_variant variant, - const char **format, - const char *format_end, - pm_pack_type *type, - pm_pack_signed *signed_type, - pm_pack_endian *endian, - pm_pack_size *size, - pm_pack_length_type *length_type, - uint64_t *length, - pm_pack_encoding *encoding -) { - if (*encoding == PM_PACK_ENCODING_START) { - *encoding = PM_PACK_ENCODING_US_ASCII; - } - - if (*format == format_end) { - *type = PM_PACK_END; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - *length_type = PM_PACK_LENGTH_NA; - return PM_PACK_OK; - } - - *length_type = PM_PACK_LENGTH_FIXED; - *length = 1; - bool length_changed_allowed = true; - - char directive = **format; - (*format)++; - switch (directive) { - case ' ': - case '\t': - case '\n': - case '\v': - case '\f': - case '\r': - *type = PM_PACK_SPACE; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - *length_type = PM_PACK_LENGTH_NA; - *length = 0; - return PM_PACK_OK; - case '#': - while ((*format < format_end) && (**format != '\n')) { - (*format)++; - } - *type = PM_PACK_COMMENT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - *length_type = PM_PACK_LENGTH_NA; - *length = 0; - return PM_PACK_OK; - case 'C': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_AGNOSTIC_ENDIAN; - *size = PM_PACK_SIZE_8; - break; - case 'S': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_16; - break; - case 'L': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_32; - break; - case 'Q': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_64; - break; - case 'J': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_P; - break; - case 'c': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_AGNOSTIC_ENDIAN; - *size = PM_PACK_SIZE_8; - break; - case 's': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_16; - break; - case 'l': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_32; - break; - case 'q': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_64; - break; - case 'j': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_P; - break; - case 'I': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_INT; - break; - case 'i': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_SIGNED; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_INT; - break; - case 'n': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_BIG_ENDIAN; - *size = PM_PACK_SIZE_16; - length_changed_allowed = false; - break; - case 'N': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_BIG_ENDIAN; - *size = PM_PACK_SIZE_32; - length_changed_allowed = false; - break; - case 'v': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_LITTLE_ENDIAN; - *size = PM_PACK_SIZE_16; - length_changed_allowed = false; - break; - case 'V': - *type = PM_PACK_INTEGER; - *signed_type = PM_PACK_UNSIGNED; - *endian = PM_PACK_LITTLE_ENDIAN; - *size = PM_PACK_SIZE_32; - length_changed_allowed = false; - break; - case 'U': - *type = PM_PACK_UTF8; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'w': - *type = PM_PACK_BER; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'D': - case 'd': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_64; - break; - case 'F': - case 'f': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_NATIVE_ENDIAN; - *size = PM_PACK_SIZE_32; - break; - case 'E': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_LITTLE_ENDIAN; - *size = PM_PACK_SIZE_64; - break; - case 'e': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_LITTLE_ENDIAN; - *size = PM_PACK_SIZE_32; - break; - case 'G': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_BIG_ENDIAN; - *size = PM_PACK_SIZE_64; - break; - case 'g': - *type = PM_PACK_FLOAT; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_BIG_ENDIAN; - *size = PM_PACK_SIZE_32; - break; - case 'A': - *type = PM_PACK_STRING_SPACE_PADDED; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'a': - *type = PM_PACK_STRING_NULL_PADDED; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'Z': - *type = PM_PACK_STRING_NULL_TERMINATED; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'B': - *type = PM_PACK_STRING_MSB; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'b': - *type = PM_PACK_STRING_LSB; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'H': - *type = PM_PACK_STRING_HEX_HIGH; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'h': - *type = PM_PACK_STRING_HEX_LOW; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'u': - *type = PM_PACK_STRING_UU; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'M': - *type = PM_PACK_STRING_MIME; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'm': - *type = PM_PACK_STRING_BASE64; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'P': - *type = PM_PACK_STRING_FIXED; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'p': - *type = PM_PACK_STRING_POINTER; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case '@': - *type = PM_PACK_MOVE; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'X': - *type = PM_PACK_BACK; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case 'x': - *type = PM_PACK_NULL; - *signed_type = PM_PACK_SIGNED_NA; - *endian = PM_PACK_ENDIAN_NA; - *size = PM_PACK_SIZE_NA; - break; - case '%': - return PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE; - default: - return PM_PACK_ERROR_UNKNOWN_DIRECTIVE; - } - - bool explicit_endian = false; - - while (*format < format_end) { - switch (**format) { - case '_': - case '!': - (*format)++; - if (*type != PM_PACK_INTEGER || !length_changed_allowed) { - return PM_PACK_ERROR_BANG_NOT_ALLOWED; - } - switch (*size) { - case PM_PACK_SIZE_SHORT: - case PM_PACK_SIZE_INT: - case PM_PACK_SIZE_LONG: - case PM_PACK_SIZE_LONG_LONG: - break; - case PM_PACK_SIZE_16: - *size = PM_PACK_SIZE_SHORT; - break; - case PM_PACK_SIZE_32: - *size = PM_PACK_SIZE_LONG; - break; - case PM_PACK_SIZE_64: - *size = PM_PACK_SIZE_LONG_LONG; - break; - case PM_PACK_SIZE_P: - break; - default: - return PM_PACK_ERROR_BANG_NOT_ALLOWED; - } - break; - case '<': - (*format)++; - if (explicit_endian) { - return PM_PACK_ERROR_DOUBLE_ENDIAN; - } - *endian = PM_PACK_LITTLE_ENDIAN; - explicit_endian = true; - break; - case '>': - (*format)++; - if (explicit_endian) { - return PM_PACK_ERROR_DOUBLE_ENDIAN; - } - *endian = PM_PACK_BIG_ENDIAN; - explicit_endian = true; - break; - default: - goto exit_modifier_loop; - } - } - -exit_modifier_loop: - - if (variant == PM_PACK_VARIANT_UNPACK && *type == PM_PACK_MOVE) { - *length = 0; - } - - if (*format < format_end) { - if (**format == '*') { - switch (*type) { - case PM_PACK_NULL: - case PM_PACK_BACK: - switch (variant) { - case PM_PACK_VARIANT_PACK: - *length_type = PM_PACK_LENGTH_FIXED; - break; - case PM_PACK_VARIANT_UNPACK: - *length_type = PM_PACK_LENGTH_MAX; - break; - } - *length = 0; - break; - - case PM_PACK_MOVE: - switch (variant) { - case PM_PACK_VARIANT_PACK: - *length_type = PM_PACK_LENGTH_FIXED; - break; - case PM_PACK_VARIANT_UNPACK: - *length_type = PM_PACK_LENGTH_RELATIVE; - break; - } - *length = 0; - break; - - case PM_PACK_STRING_UU: - *length_type = PM_PACK_LENGTH_FIXED; - *length = 0; - break; - - case PM_PACK_STRING_FIXED: - switch (variant) { - case PM_PACK_VARIANT_PACK: - *length_type = PM_PACK_LENGTH_FIXED; - *length = 1; - break; - case PM_PACK_VARIANT_UNPACK: - *length_type = PM_PACK_LENGTH_MAX; - *length = 0; - break; - } - break; - - case PM_PACK_STRING_MIME: - case PM_PACK_STRING_BASE64: - *length_type = PM_PACK_LENGTH_FIXED; - *length = 1; - break; - - default: - *length_type = PM_PACK_LENGTH_MAX; - *length = 0; - break; - } - - (*format)++; - } else if (**format >= '0' && **format <= '9') { - errno = 0; - *length_type = PM_PACK_LENGTH_FIXED; - #if UINTMAX_MAX < UINT64_MAX - #error "prism's design assumes uintmax_t is at least as large as uint64_t" - #endif - uintmax_t length_max = strtoumaxc(format); - if (errno || length_max > UINT64_MAX) { - return PM_PACK_ERROR_LENGTH_TOO_BIG; - } - *length = (uint64_t) length_max; - } - } - - switch (*type) { - case PM_PACK_UTF8: - /* if encoding is US-ASCII, upgrade to UTF-8 */ - if (*encoding == PM_PACK_ENCODING_US_ASCII) { - *encoding = PM_PACK_ENCODING_UTF_8; - } - break; - case PM_PACK_STRING_MIME: - case PM_PACK_STRING_BASE64: - case PM_PACK_STRING_UU: - /* keep US-ASCII (do nothing) */ - break; - default: - /* fall back to BINARY */ - *encoding = PM_PACK_ENCODING_ASCII_8BIT; - break; - } - - return PM_PACK_OK; -} - -PRISM_EXPORTED_FUNCTION size_t -pm_size_to_native(pm_pack_size size) { - switch (size) { - case PM_PACK_SIZE_SHORT: - return sizeof(short); - case PM_PACK_SIZE_INT: - return sizeof(int); - case PM_PACK_SIZE_LONG: - return sizeof(long); - case PM_PACK_SIZE_LONG_LONG: - return sizeof(long long); - case PM_PACK_SIZE_8: - return 1; - case PM_PACK_SIZE_16: - return 2; - case PM_PACK_SIZE_32: - return 4; - case PM_PACK_SIZE_64: - return 8; - case PM_PACK_SIZE_P: - return sizeof(void *); - default: - return 0; - } -} - -#endif diff --git a/prism/pack.h b/prism/pack.h deleted file mode 100644 index 0b0b4b19cc..0000000000 --- a/prism/pack.h +++ /dev/null @@ -1,163 +0,0 @@ -/** - * @file pack.h - * - * A pack template string parser. - */ -#ifndef PRISM_PACK_H -#define PRISM_PACK_H - -#include "prism/defines.h" - -// We optionally support parsing String#pack templates. For systems that don't -// want or need this functionality, it can be turned off with the -// PRISM_EXCLUDE_PACK define. -#ifdef PRISM_EXCLUDE_PACK - -void pm_pack_parse(void); - -#else - -#include <stdint.h> -#include <stdlib.h> - -/** The version of the pack template language that we are parsing. */ -typedef enum pm_pack_version { - PM_PACK_VERSION_3_2_0 -} pm_pack_version; - -/** The type of pack template we are parsing. */ -typedef enum pm_pack_variant { - PM_PACK_VARIANT_PACK, - PM_PACK_VARIANT_UNPACK -} pm_pack_variant; - -/** A directive within the pack template. */ -typedef enum pm_pack_type { - PM_PACK_SPACE, - PM_PACK_COMMENT, - PM_PACK_INTEGER, - PM_PACK_UTF8, - PM_PACK_BER, - PM_PACK_FLOAT, - PM_PACK_STRING_SPACE_PADDED, - PM_PACK_STRING_NULL_PADDED, - PM_PACK_STRING_NULL_TERMINATED, - PM_PACK_STRING_MSB, - PM_PACK_STRING_LSB, - PM_PACK_STRING_HEX_HIGH, - PM_PACK_STRING_HEX_LOW, - PM_PACK_STRING_UU, - PM_PACK_STRING_MIME, - PM_PACK_STRING_BASE64, - PM_PACK_STRING_FIXED, - PM_PACK_STRING_POINTER, - PM_PACK_MOVE, - PM_PACK_BACK, - PM_PACK_NULL, - PM_PACK_END -} pm_pack_type; - -/** The signness of a pack directive. */ -typedef enum pm_pack_signed { - PM_PACK_UNSIGNED, - PM_PACK_SIGNED, - PM_PACK_SIGNED_NA -} pm_pack_signed; - -/** The endianness of a pack directive. */ -typedef enum pm_pack_endian { - PM_PACK_AGNOSTIC_ENDIAN, - PM_PACK_LITTLE_ENDIAN, // aka 'VAX', or 'V' - PM_PACK_BIG_ENDIAN, // aka 'network', or 'N' - PM_PACK_NATIVE_ENDIAN, - PM_PACK_ENDIAN_NA -} pm_pack_endian; - -/** The size of an integer pack directive. */ -typedef enum pm_pack_size { - PM_PACK_SIZE_SHORT, - PM_PACK_SIZE_INT, - PM_PACK_SIZE_LONG, - PM_PACK_SIZE_LONG_LONG, - PM_PACK_SIZE_8, - PM_PACK_SIZE_16, - PM_PACK_SIZE_32, - PM_PACK_SIZE_64, - PM_PACK_SIZE_P, - PM_PACK_SIZE_NA -} pm_pack_size; - -/** The type of length of a pack directive. */ -typedef enum pm_pack_length_type { - PM_PACK_LENGTH_FIXED, - PM_PACK_LENGTH_MAX, - PM_PACK_LENGTH_RELATIVE, // special case for unpack @* - PM_PACK_LENGTH_NA -} pm_pack_length_type; - -/** The type of encoding for a pack template string. */ -typedef enum pm_pack_encoding { - PM_PACK_ENCODING_START, - PM_PACK_ENCODING_ASCII_8BIT, - PM_PACK_ENCODING_US_ASCII, - PM_PACK_ENCODING_UTF_8 -} pm_pack_encoding; - -/** The result of parsing a pack template. */ -typedef enum pm_pack_result { - PM_PACK_OK, - PM_PACK_ERROR_UNSUPPORTED_DIRECTIVE, - PM_PACK_ERROR_UNKNOWN_DIRECTIVE, - PM_PACK_ERROR_LENGTH_TOO_BIG, - PM_PACK_ERROR_BANG_NOT_ALLOWED, - PM_PACK_ERROR_DOUBLE_ENDIAN -} pm_pack_result; - -/** - * Parse a single directive from a pack or unpack format string. - * - * @param variant (in) pack or unpack - * @param format (in, out) the start of the next directive to parse on calling, - * and advanced beyond the parsed directive on return, or as much of it as - * was consumed until an error was encountered - * @param format_end (in) the end of the format string - * @param type (out) the type of the directive - * @param signed_type (out) whether the value is signed - * @param endian (out) the endianness of the value - * @param size (out) the size of the value - * @param length_type (out) what kind of length is specified - * @param length (out) the length of the directive - * @param encoding (in, out) takes the current encoding of the string which - * would result from parsing the whole format string, and returns a possibly - * changed directive - the encoding should be `PM_PACK_ENCODING_START` when - * pm_pack_parse is called for the first directive in a format string - * - * @return `PM_PACK_OK` on success or `PM_PACK_ERROR_*` on error - * @note Consult Ruby documentation for the meaning of directives. - */ -PRISM_EXPORTED_FUNCTION pm_pack_result -pm_pack_parse( - pm_pack_variant variant, - const char **format, - const char *format_end, - pm_pack_type *type, - pm_pack_signed *signed_type, - pm_pack_endian *endian, - pm_pack_size *size, - pm_pack_length_type *length_type, - uint64_t *length, - pm_pack_encoding *encoding -); - -/** - * Prism abstracts sizes away from the native system - this converts an abstract - * size to a native size. - * - * @param size The abstract size to convert. - * @return The native size. - */ -PRISM_EXPORTED_FUNCTION size_t pm_size_to_native(pm_pack_size size); - -#endif - -#endif diff --git a/prism/parser.c b/prism/parser.c new file mode 100644 index 0000000000..415cd31984 --- /dev/null +++ b/prism/parser.c @@ -0,0 +1,302 @@ +#include "prism/internal/parser.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/comments.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/magic_comments.h" + +#include <stdlib.h> + +/** + * Register a callback that will be called whenever prism changes the encoding + * it is using to parse based on the magic comment. + */ +void +pm_parser_encoding_changed_callback_set(pm_parser_t *parser, pm_encoding_changed_callback_t callback) { + parser->encoding_changed_callback = callback; +} + +/** + * Register a callback that will be called whenever a token is lexed. + */ +void +pm_parser_lex_callback_set(pm_parser_t *parser, pm_lex_callback_t callback, void *data) { + parser->lex_callback.callback = callback; + parser->lex_callback.data = data; +} + +/** + * Returns the opaque data that is passed to the lex callback when it is called. + */ +void * +pm_parser_lex_callback_data(const pm_parser_t *parser) { + return parser->lex_callback.data; +} + +/** + * Returns the raw pointer to the start of the source that is being parsed. + */ +const uint8_t * +pm_parser_start(const pm_parser_t *parser) { + return parser->start; +} + +/** + * Returns the raw pointer to the end of the source that is being parsed. + */ +const uint8_t * +pm_parser_end(const pm_parser_t *parser) { + return parser->end; +} + +/** + * Returns the line that the parser was considered to have started on. + * + * @param parser the parser whose start line we want to get + * @return the line that the parser was considered to have started on + */ +int32_t +pm_parser_start_line(const pm_parser_t *parser) { + return parser->start_line; +} + +/** + * Returns the name of the encoding that is being used to parse the source. + */ +const char * +pm_parser_encoding_name(const pm_parser_t *parser) { + return parser->encoding->name; +} + +/** + * Returns the width of the character at the given pointer in the encoding that + * is being used to parse the source. + */ +size_t +pm_parser_encoding_char_width(const pm_parser_t *parser, const uint8_t *start, ptrdiff_t remaining) { + return parser->encoding->char_width(start, remaining); +} + +/** + * Returns whether or not the parser is using the US-ASCII encoding. + */ +bool +pm_parser_encoding_us_ascii(const pm_parser_t *parser) { + return parser->encoding == PM_ENCODING_US_ASCII_ENTRY; +} + +/** + * Returns the filepath that is being used to parse the source. + */ +const pm_string_t * +pm_parser_filepath(const pm_parser_t *parser) { + return &parser->filepath; +} + +/** + * Find a constant in the parser's constant pool. Returns the id of the + * constant, or 0 if the constant is not found. + */ +pm_constant_id_t +pm_parser_constant_find(const pm_parser_t *parser, const uint8_t *start, size_t length) { + return pm_constant_pool_find(&parser->constant_pool, start, length); +} + +/** + * Returns the frozen string literal value of the parser. + */ +int8_t +pm_parser_frozen_string_literal(const pm_parser_t *parser) { + return parser->frozen_string_literal; +} + +/** + * Returns the line offsets that are associated with the given parser. + * + * @param parser the parser whose line offsets we want to get + * @return the line offsets that are associated with the given parser + */ +const pm_line_offset_list_t * +pm_parser_line_offsets(const pm_parser_t *parser) { + return &parser->line_offsets; +} + +/** + * Returns the location of the __DATA__ section that is associated with the + * given parser, if it exists. + */ +const pm_location_t * +pm_parser_data_loc(const pm_parser_t *parser) { + return &parser->data_loc; +} + +/** + * Returns whether the given parser is continuable, meaning that it could become + * valid if more input were appended, as opposed to being definitively invalid. + */ +bool +pm_parser_continuable(const pm_parser_t *parser) { + return parser->continuable; +} + +/** + * Returns the lex state of the parser. Note that this is an internal detail, + * and we are purposefully not returning an instance of the internal enum that + * we use to track this. This is only exposed because we need it for some very + * niche use cases. Most consumers should avoid this function. + */ +int +pm_parser_lex_state(const pm_parser_t *parser) { + return (int) parser->lex_state; +} + +/** + * Returns the location associated with the given comment. + */ +pm_location_t +pm_comment_location(const pm_comment_t *comment) { + return comment->location; +} + +/** + * Returns the type associated with the given comment. + */ +pm_comment_type_t +pm_comment_type(const pm_comment_t *comment) { + return comment->type; +} + +/** + * Returns the number of comments associated with the given parser. + */ +size_t +pm_parser_comments_size(const pm_parser_t *parser) { + return parser->comment_list.size; +} + +/** + * Iterates over the comments associated with the given parser and calls the + * given callback for each comment. + */ +void +pm_parser_comments_each(const pm_parser_t *parser, pm_comment_callback_t callback, void *data) { + const pm_list_node_t *current = parser->comment_list.head; + while (current != NULL) { + const pm_comment_t *comment = (const pm_comment_t *) current; + callback(comment, data); + current = current->next; + } +} + +/** + * Returns the location associated with the given magic comment key. + */ +pm_location_t +pm_magic_comment_key(const pm_magic_comment_t *magic_comment) { + return magic_comment->key; +} + +/** + * Returns the location associated with the given magic comment value. + */ +pm_location_t +pm_magic_comment_value(const pm_magic_comment_t *magic_comment) { + return magic_comment->value; +} + +/** + * Returns the number of magic comments associated with the given parser. + */ +size_t +pm_parser_magic_comments_size(const pm_parser_t *parser) { + return parser->magic_comment_list.size; +} + +/** + * Iterates over the magic comments associated with the given parser and calls + * the given callback for each magic comment. + */ +void +pm_parser_magic_comments_each(const pm_parser_t *parser, pm_magic_comment_callback_t callback, void *data) { + const pm_list_node_t *current = parser->magic_comment_list.head; + while (current != NULL) { + const pm_magic_comment_t *magic_comment = (const pm_magic_comment_t *) current; + callback(magic_comment, data); + current = current->next; + } +} + +/** + * Returns the number of errors associated with the given parser. + */ +size_t +pm_parser_errors_size(const pm_parser_t *parser) { + return parser->error_list.size; +} + +/** + * Returns the number of warnings associated with the given parser. + */ +size_t +pm_parser_warnings_size(const pm_parser_t *parser) { + return parser->warning_list.size; +} + +static inline void +pm_parser_diagnostics_each(const pm_list_t *list, pm_diagnostic_callback_t callback, void *data) { + const pm_list_node_t *current = list->head; + while (current != NULL) { + const pm_diagnostic_t *diagnostic = (const pm_diagnostic_t *) current; + callback(diagnostic, data); + current = current->next; + } +} + +/** + * Iterates over the errors associated with the given parser and calls the + * given callback for each error. + */ +void +pm_parser_errors_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) { + pm_parser_diagnostics_each(&parser->error_list, callback, data); +} + +/** + * Iterates over the warnings associated with the given parser and calls the + * given callback for each warning. + */ +void +pm_parser_warnings_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) { + pm_parser_diagnostics_each(&parser->warning_list, callback, data); +} + +/** + * Returns the number of constants in the constant pool associated with the + * given parser. + */ +size_t +pm_parser_constants_size(const pm_parser_t *parser) { + return parser->constant_pool.size; +} + +/** + * Iterates over the constants in the constant pool associated with the given + * parser and calls the given callback for each constant. + */ +void +pm_parser_constants_each(const pm_parser_t *parser, pm_constant_callback_t callback, void *data) { + for (uint32_t index = 0; index < parser->constant_pool.size; index++) { + const pm_constant_t *constant = &parser->constant_pool.constants[index]; + callback(constant, data); + } +} + +/** + * Returns a pointer to the constant at the given id in the constant pool + * associated with the given parser. + */ +const pm_constant_t * +pm_parser_constant(const pm_parser_t *parser, pm_constant_id_t constant_id) { + return pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id); +} diff --git a/prism/parser.h b/prism/parser.h index 992729d655..2c8c4b3a7a 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -6,928 +6,343 @@ #ifndef PRISM_PARSER_H #define PRISM_PARSER_H -#include "prism/defines.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + #include "prism/ast.h" -#include "prism/encoding.h" +#include "prism/comments.h" +#include "prism/diagnostic.h" +#include "prism/line_offset_list.h" +#include "prism/magic_comments.h" #include "prism/options.h" -#include "prism/static_literals.h" -#include "prism/util/pm_constant_pool.h" -#include "prism/util/pm_list.h" -#include "prism/util/pm_newline_list.h" -#include "prism/util/pm_string.h" - -#include <stdbool.h> - -/** - * This enum provides various bits that represent different kinds of states that - * the lexer can track. This is used to determine which kind of token to return - * based on the context of the parser. - */ -typedef enum { - PM_LEX_STATE_BIT_BEG, - PM_LEX_STATE_BIT_END, - PM_LEX_STATE_BIT_ENDARG, - PM_LEX_STATE_BIT_ENDFN, - PM_LEX_STATE_BIT_ARG, - PM_LEX_STATE_BIT_CMDARG, - PM_LEX_STATE_BIT_MID, - PM_LEX_STATE_BIT_FNAME, - PM_LEX_STATE_BIT_DOT, - PM_LEX_STATE_BIT_CLASS, - PM_LEX_STATE_BIT_LABEL, - PM_LEX_STATE_BIT_LABELED, - PM_LEX_STATE_BIT_FITEM -} pm_lex_state_bit_t; - -/** - * This enum combines the various bits from the above enum into individual - * values that represent the various states of the lexer. - */ -typedef enum { - PM_LEX_STATE_NONE = 0, - PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), - PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), - PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), - PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), - PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), - PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), - PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), - PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), - PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), - PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), - PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), - PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), - PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), - PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, - PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, - PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN -} pm_lex_state_t; - -/** - * The type of quote that a heredoc uses. - */ -typedef enum { - PM_HEREDOC_QUOTE_NONE, - PM_HEREDOC_QUOTE_SINGLE = '\'', - PM_HEREDOC_QUOTE_DOUBLE = '"', - PM_HEREDOC_QUOTE_BACKTICK = '`', -} pm_heredoc_quote_t; - -/** - * The type of indentation that a heredoc uses. - */ -typedef enum { - PM_HEREDOC_INDENT_NONE, - PM_HEREDOC_INDENT_DASH, - PM_HEREDOC_INDENT_TILDE, -} pm_heredoc_indent_t; - -/** - * All of the information necessary to store to lexing a heredoc. - */ -typedef struct { - /** A pointer to the start of the heredoc identifier. */ - const uint8_t *ident_start; - - /** The length of the heredoc identifier. */ - size_t ident_length; - - /** The type of quote that the heredoc uses. */ - pm_heredoc_quote_t quote; - - /** The type of indentation that the heredoc uses. */ - pm_heredoc_indent_t indent; -} pm_heredoc_lex_mode_t; - -/** - * When lexing Ruby source, the lexer has a small amount of state to tell which - * kind of token it is currently lexing. For example, when we find the start of - * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After - * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that - * are found as part of a string. - */ -typedef struct pm_lex_mode { - /** The type of this lex mode. */ - enum { - /** This state is used when any given token is being lexed. */ - PM_LEX_DEFAULT, - - /** - * This state is used when we're lexing as normal but inside an embedded - * expression of a string. - */ - PM_LEX_EMBEXPR, - - /** - * This state is used when we're lexing a variable that is embedded - * directly inside of a string with the # shorthand. - */ - PM_LEX_EMBVAR, - - /** This state is used when you are inside the content of a heredoc. */ - PM_LEX_HEREDOC, - - /** - * This state is used when we are lexing a list of tokens, as in a %w - * word list literal or a %i symbol list literal. - */ - PM_LEX_LIST, - - /** - * This state is used when a regular expression has been begun and we - * are looking for the terminator. - */ - PM_LEX_REGEXP, - - /** - * This state is used when we are lexing a string or a string-like - * token, as in string content with either quote or an xstring. - */ - PM_LEX_STRING - } mode; - - /** The data associated with this type of lex mode. */ - union { - struct { - /** This keeps track of the nesting level of the list. */ - size_t nesting; - - /** Whether or not interpolation is allowed in this list. */ - bool interpolation; - - /** - * When lexing a list, it takes into account balancing the - * terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** This is the terminator of the list literal. */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the list. - */ - uint8_t breakpoints[11]; - } list; - - struct { - /** - * This keeps track of the nesting level of the regular expression. - */ - size_t nesting; - - /** - * When lexing a regular expression, it takes into account balancing - * the terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** This is the terminator of the regular expression. */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the regular expression. - */ - uint8_t breakpoints[7]; - } regexp; - - struct { - /** This keeps track of the nesting level of the string. */ - size_t nesting; - - /** Whether or not interpolation is allowed in this string. */ - bool interpolation; - - /** - * Whether or not at the end of the string we should allow a :, - * which would indicate this was a dynamic symbol instead of a - * string. - */ - bool label_allowed; - - /** - * When lexing a string, it takes into account balancing the - * terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** - * This is the terminator of the string. It is typically either a - * single or double quote. - */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the string. - */ - uint8_t breakpoints[7]; - } string; - - struct { - /** - * All of the data necessary to lex a heredoc. - */ - pm_heredoc_lex_mode_t base; - - /** - * This is the pointer to the character where lexing should resume - * once the heredoc has been completely processed. - */ - const uint8_t *next_start; - - /** - * This is used to track the amount of common whitespace on each - * line so that we know how much to dedent each line in the case of - * a tilde heredoc. - */ - size_t *common_whitespace; - - /** True if the previous token ended with a line continuation. */ - bool line_continuation; - } heredoc; - } as; - - /** The previous lex state so that it knows how to pop. */ - struct pm_lex_mode *prev; -} pm_lex_mode_t; - -/** - * We pre-allocate a certain number of lex states in order to avoid having to - * call malloc too many times while parsing. You really shouldn't need more than - * this because you only really nest deeply when doing string interpolation. - */ -#define PM_LEX_STACK_SIZE 4 /** * The parser used to parse Ruby source. */ -typedef struct pm_parser pm_parser_t; +typedef struct pm_parser_t pm_parser_t; /** - * While parsing, we keep track of a stack of contexts. This is helpful for - * error recovery so that we can pop back to a previous context when we hit a - * token that is understood by a parent context but not by the current context. + * Allocate and initialize a parser with the given start and end pointers. + * + * @param arena The arena to use for all AST-lifetime allocations. It is caller- + * owned and must outlive the parser. + * @param source The source to parse. + * @param size The size of the source. + * @param options The optional options to use when parsing. These options must + * live for the whole lifetime of this parser. + * @returns The initialized parser. It is the responsibility of the caller to + * free the parser with `pm_parser_free()`. */ -typedef enum { - /** a null context, used for returning a value from a function */ - PM_CONTEXT_NONE = 0, - - /** a begin statement */ - PM_CONTEXT_BEGIN, - - /** an ensure statement with an explicit begin */ - PM_CONTEXT_BEGIN_ENSURE, - - /** a rescue else statement with an explicit begin */ - PM_CONTEXT_BEGIN_ELSE, - - /** a rescue statement with an explicit begin */ - PM_CONTEXT_BEGIN_RESCUE, - - /** expressions in block arguments using braces */ - PM_CONTEXT_BLOCK_BRACES, - - /** expressions in block arguments using do..end */ - PM_CONTEXT_BLOCK_KEYWORDS, - - /** an ensure statement within a do..end block */ - PM_CONTEXT_BLOCK_ENSURE, - - /** a rescue else statement within a do..end block */ - PM_CONTEXT_BLOCK_ELSE, - - /** a rescue statement within a do..end block */ - PM_CONTEXT_BLOCK_RESCUE, - - /** a case when statements */ - PM_CONTEXT_CASE_WHEN, - - /** a case in statements */ - PM_CONTEXT_CASE_IN, - - /** a class declaration */ - PM_CONTEXT_CLASS, - - /** an ensure statement within a class statement */ - PM_CONTEXT_CLASS_ENSURE, - - /** a rescue else statement within a class statement */ - PM_CONTEXT_CLASS_ELSE, - - /** a rescue statement within a class statement */ - PM_CONTEXT_CLASS_RESCUE, - - /** a method definition */ - PM_CONTEXT_DEF, - - /** an ensure statement within a method definition */ - PM_CONTEXT_DEF_ENSURE, - - /** a rescue else statement within a method definition */ - PM_CONTEXT_DEF_ELSE, - - /** a rescue statement within a method definition */ - PM_CONTEXT_DEF_RESCUE, - - /** a method definition's parameters */ - PM_CONTEXT_DEF_PARAMS, - - /** a defined? expression */ - PM_CONTEXT_DEFINED, - - /** a method definition's default parameter */ - PM_CONTEXT_DEFAULT_PARAMS, - - /** an else clause */ - PM_CONTEXT_ELSE, - - /** an elsif clause */ - PM_CONTEXT_ELSIF, - - /** an interpolated expression */ - PM_CONTEXT_EMBEXPR, - - /** a for loop */ - PM_CONTEXT_FOR, - - /** a for loop's index */ - PM_CONTEXT_FOR_INDEX, - - /** an if statement */ - PM_CONTEXT_IF, - - /** a lambda expression with braces */ - PM_CONTEXT_LAMBDA_BRACES, - - /** a lambda expression with do..end */ - PM_CONTEXT_LAMBDA_DO_END, - - /** an ensure statement within a lambda expression */ - PM_CONTEXT_LAMBDA_ENSURE, +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_parser_t * pm_parser_new(pm_arena_t *arena, const uint8_t *source, size_t size, const pm_options_t *options) PRISM_NONNULL(1); - /** a rescue else statement within a lambda expression */ - PM_CONTEXT_LAMBDA_ELSE, - - /** a rescue statement within a lambda expression */ - PM_CONTEXT_LAMBDA_RESCUE, +/** + * Free both the memory held by the given parser and the parser itself. + * + * @param parser The parser to free. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser) PRISM_NONNULL(1); - /** the predicate clause of a loop statement */ - PM_CONTEXT_LOOP_PREDICATE, +/** + * When the encoding that is being used to parse the source is changed by prism, + * we provide the ability here to call out to a user-defined function. + */ +typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser); - /** the top level context */ - PM_CONTEXT_MAIN, +/** + * This is the callback that is called when a token is lexed. It is passed + * the opaque data pointer, the parser, and the token that was lexed. + */ +typedef void (*pm_lex_callback_t)(pm_parser_t *parser, pm_token_t *token, void *data); - /** a module declaration */ - PM_CONTEXT_MODULE, +/** + * Register a callback that will be called whenever prism changes the encoding + * it is using to parse based on the magic comment. + * + * @param parser The parser to register the callback with. + * @param callback The callback to register. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_encoding_changed_callback_set(pm_parser_t *parser, pm_encoding_changed_callback_t callback) PRISM_NONNULL(1); - /** an ensure statement within a module statement */ - PM_CONTEXT_MODULE_ENSURE, +/** + * Register a callback that will be called whenever a token is lexed. + * + * @param parser The parser to register the callback with. + * @param data The opaque data to pass to the callback when it is called. + * @param callback The callback to register. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_lex_callback_set(pm_parser_t *parser, pm_lex_callback_t callback, void *data) PRISM_NONNULL(1); - /** a rescue else statement within a module statement */ - PM_CONTEXT_MODULE_ELSE, +/** + * Returns the opaque data that is passed to the lex callback when it is called. + * + * @param parser The parser whose lex callback data we want to get. + * @returns The opaque data that is passed to the lex callback when it is called. + */ +PRISM_EXPORTED_FUNCTION void * pm_parser_lex_callback_data(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue statement within a module statement */ - PM_CONTEXT_MODULE_RESCUE, +/** + * Returns the raw pointer to the start of the source that is being parsed. + * + * @param parser the parser whose start pointer we want to get + * @returns the raw pointer to the start of the source that is being parsed + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_start(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a multiple target expression */ - PM_CONTEXT_MULTI_TARGET, +/** + * Returns the raw pointer to the end of the source that is being parsed. + * + * @param parser the parser whose end pointer we want to get + * @returns the raw pointer to the end of the source that is being parsed + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_end(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a parenthesized expression */ - PM_CONTEXT_PARENS, +/** + * Returns the line that the parser was considered to have started on. + * + * @param parser the parser whose start line we want to get + * @returns the line that the parser was considered to have started on + */ +PRISM_EXPORTED_FUNCTION int32_t pm_parser_start_line(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an END block */ - PM_CONTEXT_POSTEXE, +/** + * Returns the name of the encoding that is being used to parse the source. + * + * @param parser the parser whose encoding name we want to get + * @returns the name of the encoding that is being used to parse the source + */ +PRISM_EXPORTED_FUNCTION const char * pm_parser_encoding_name(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a predicate inside an if/elsif/unless statement */ - PM_CONTEXT_PREDICATE, +/** + * Returns the width of the character at the given pointer in the encoding that + * is being used to parse the source. + * + * @param parser the parser whose encoding we want to use + * @param start a pointer to the start of the character + * @param remaining the number of bytes remaining in the source + * @returns the width of the character in bytes + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_encoding_char_width(const pm_parser_t *parser, const uint8_t *start, ptrdiff_t remaining) PRISM_NONNULL(1, 2); - /** a BEGIN block */ - PM_CONTEXT_PREEXE, +/** + * Returns whether or not the parser is using the US-ASCII encoding. + * + * @param parser the parser to check + * @returns true if the parser is using US-ASCII encoding, false otherwise + */ +PRISM_EXPORTED_FUNCTION bool pm_parser_encoding_us_ascii(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a modifier rescue clause */ - PM_CONTEXT_RESCUE_MODIFIER, +/** + * Returns the filepath that is being used to parse the source. + * + * @param parser the parser whose filepath we want to get + * @returns a pointer to the filepath string + */ +PRISM_EXPORTED_FUNCTION const pm_string_t * pm_parser_filepath(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a singleton class definition */ - PM_CONTEXT_SCLASS, +/** + * Find a constant in the parser's constant pool. Returns the id of the + * constant, or 0 if the constant is not found. + * + * @param parser the parser whose constant pool we want to search + * @param start a pointer to the start of the string to search for + * @param length the length of the string to search for + * @returns the id of the constant, or 0 if the constant is not found + */ +PRISM_EXPORTED_FUNCTION pm_constant_id_t pm_parser_constant_find(const pm_parser_t *parser, const uint8_t *start, size_t length) PRISM_NONNULL(1, 2); - /** an ensure statement with a singleton class */ - PM_CONTEXT_SCLASS_ENSURE, +/** + * Returns the frozen string literal value of the parser, as determined by the + * frozen_string_literal magic comment or the option set on the parser. + * + * @param parser the parser whose frozen string literal value we want to get + * @returns -1 if disabled, 0 if unset, 1 if enabled + */ +PRISM_EXPORTED_FUNCTION int8_t pm_parser_frozen_string_literal(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue else statement with a singleton class */ - PM_CONTEXT_SCLASS_ELSE, +/** + * Returns the line offsets that are associated with the given parser. + * + * @param parser the parser whose line offsets we want to get + * @returns the line offsets that are associated with the given parser + */ +PRISM_EXPORTED_FUNCTION const pm_line_offset_list_t * pm_parser_line_offsets(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue statement with a singleton class */ - PM_CONTEXT_SCLASS_RESCUE, +/** + * Returns the location of the __DATA__ section that is associated with the + * given parser. + * + * @param parser the parser whose data location we want to get + * @returns the location of the __DATA__ section that is associated with the + * given parser. If it is unset, then the length will be set to 0. + */ +PRISM_EXPORTED_FUNCTION const pm_location_t * pm_parser_data_loc(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a ternary expression */ - PM_CONTEXT_TERNARY, +/** + * Returns whether the given parser is continuable, meaning that it could become + * valid if more input were appended, as opposed to being definitively invalid. + * + * @param parser the parser whose continuable status we want to get + * @returns whether the given parser is continuable + */ +PRISM_EXPORTED_FUNCTION bool pm_parser_continuable(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an unless statement */ - PM_CONTEXT_UNLESS, +/** + * Returns the lex state of the parser. Note that this is an internal detail, + * and we are purposefully not returning an instance of the internal enum that + * we use to track this. This is only exposed because we need it for some very + * niche use cases. Most consumers should avoid this function. + * + * @param parser the parser whose lex state we want to get + * @returns the lex state of the parser + */ +PRISM_EXPORTED_FUNCTION int pm_parser_lex_state(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an until statement */ - PM_CONTEXT_UNTIL, +/** + * Returns the number of comments associated with the given parser. + * + * @param parser the parser whose comments we want to get the size of + * @returns the number of comments associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a while statement */ - PM_CONTEXT_WHILE, -} pm_context_t; +/** + * A callback function that can be used to process comments found while parsing. + */ +typedef void (*pm_comment_callback_t)(const pm_comment_t *comment, void *data); -/** This is a node in a linked list of contexts. */ -typedef struct pm_context_node { - /** The context that this node represents. */ - pm_context_t context; +/** + * Iterates over the comments associated with the given parser and calls the + * given callback for each comment. + * + * @param parser the parser whose comments we want to iterate over + * @param callback the callback function to call for each comment. This function + * will be passed a pointer to the comment and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each comment. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_comments_each(const pm_parser_t *parser, pm_comment_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the previous context in the linked list. */ - struct pm_context_node *prev; -} pm_context_node_t; +/** + * Returns the number of magic comments associated with the given parser. + * + * @param parser the parser whose magic comments we want to get the size of + * @returns the number of magic comments associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_magic_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1); -/** This is the type of a comment that we've found while parsing. */ -typedef enum { - PM_COMMENT_INLINE, - PM_COMMENT_EMBDOC -} pm_comment_type_t; +/** + * A callback function that can be used to process magic comments found while parsing. + */ +typedef void (*pm_magic_comment_callback_t)(const pm_magic_comment_t *magic_comment, void *data); /** - * This is a node in the linked list of comments that we've found while parsing. + * Iterates over the magic comments associated with the given parser and calls the + * given callback for each magic comment. * - * @extends pm_list_node_t + * @param parser the parser whose magic comments we want to iterate over + * @param callback the callback function to call for each magic comment. This + * function will be passed a pointer to the magic comment and the data + * parameter passed to this function. + * @param data the data to pass to the callback function for each magic comment. + * This can be NULL if no data needs to be passed to the callback function. */ -typedef struct pm_comment { - /** The embedded base node. */ - pm_list_node_t node; +PRISM_EXPORTED_FUNCTION void pm_parser_magic_comments_each(const pm_parser_t *parser, pm_magic_comment_callback_t callback, void *data) PRISM_NONNULL(1); - /** The location of the comment in the source. */ - pm_location_t location; +/** + * Returns the number of errors associated with the given parser. + * + * @param parser the parser whose errors we want to get the size of + * @returns the number of errors associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_errors_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** The type of comment that we've found. */ - pm_comment_type_t type; -} pm_comment_t; +/** + * Returns the number of warnings associated with the given parser. + * + * @param parser the parser whose warnings we want to get the size of + * @returns the number of warnings associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_warnings_size(const pm_parser_t *parser) PRISM_NONNULL(1); /** - * This is a node in the linked list of magic comments that we've found while + * A callback function that can be used to process diagnostics found while * parsing. + */ +typedef void (*pm_diagnostic_callback_t)(const pm_diagnostic_t *diagnostic, void *data); + +/** + * Iterates over the errors associated with the given parser and calls the + * given callback for each error. * - * @extends pm_list_node_t + * @param parser the parser whose errors we want to iterate over + * @param callback the callback function to call for each error. This function + * will be passed a pointer to the error and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each error. This + * can be NULL if no data needs to be passed to the callback function. */ -typedef struct { - /** The embedded base node. */ - pm_list_node_t node; +PRISM_EXPORTED_FUNCTION void pm_parser_errors_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the start of the key in the source. */ - const uint8_t *key_start; +/** + * Iterates over the warnings associated with the given parser and calls the + * given callback for each warning. + * + * @param parser the parser whose warnings we want to iterate over + * @param callback the callback function to call for each warning. This function + * will be passed a pointer to the warning and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each warning. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_warnings_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the start of the value in the source. */ - const uint8_t *value_start; +/** + * Returns the number of constants in the constant pool associated with the + * given parser. + * + * @param parser the parser whose constant pool constants we want to get the + * size of + * @returns the number of constants in the constant pool associated with the + * given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_constants_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** The length of the key in the source. */ - uint32_t key_length; +/** + * A callback function that can be used to process constants found while + * parsing. + */ +typedef void (*pm_constant_callback_t)(const pm_constant_t *constant, void *data); - /** The length of the value in the source. */ - uint32_t value_length; -} pm_magic_comment_t; +/** + * Iterates over the constants in the constant pool associated with the given + * parser and calls the given callback for each constant. + * + * @param parser the parser whose constants we want to iterate over + * @param callback the callback function to call for each constant. This function + * will be passed a pointer to the constant and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each constant. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_constants_each(const pm_parser_t *parser, pm_constant_callback_t callback, void *data) PRISM_NONNULL(1); /** - * When the encoding that is being used to parse the source is changed by prism, - * we provide the ability here to call out to a user-defined function. + * Returns a pointer to the constant at the given id in the constant pool + * associated with the given parser. + * + * @param parser the parser whose constant pool we want to look up from + * @param constant_id the id of the constant to look up (1-based) + * @returns a pointer to the constant at the given id */ -typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser); +PRISM_EXPORTED_FUNCTION const pm_constant_t * pm_parser_constant(const pm_parser_t *parser, pm_constant_id_t constant_id) PRISM_NONNULL(1); /** - * When you are lexing through a file, the lexer needs all of the information - * that the parser additionally provides (for example, the local table). So if - * you want to properly lex Ruby, you need to actually lex it in the context of - * the parser. In order to provide this functionality, we optionally allow a - * struct to be attached to the parser that calls back out to a user-provided - * callback when each token is lexed. - */ -typedef struct { - /** - * This opaque pointer is used to provide whatever information the user - * deemed necessary to the callback. In our case we use it to pass the array - * that the tokens get appended into. - */ - void *data; - - /** - * This is the callback that is called when a token is lexed. It is passed - * the opaque data pointer, the parser, and the token that was lexed. - */ - void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token); -} pm_lex_callback_t; - -/** The type of shareable constant value that can be set. */ -typedef uint8_t pm_shareable_constant_value_t; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY; - -/** - * This tracks an individual local variable in a certain lexical context, as - * well as the number of times is it read. - */ -typedef struct { - /** The name of the local variable. */ - pm_constant_id_t name; - - /** The location of the local variable in the source. */ - pm_location_t location; - - /** The index of the local variable in the local table. */ - uint32_t index; - - /** The number of times the local variable is read. */ - uint32_t reads; - - /** The hash of the local variable. */ - uint32_t hash; -} pm_local_t; - -/** - * This is a set of local variables in a certain lexical context (method, class, - * module, etc.). We need to track how many times these variables are read in - * order to warn if they only get written. - */ -typedef struct pm_locals { - /** The number of local variables in the set. */ - uint32_t size; - - /** The capacity of the local variables set. */ - uint32_t capacity; - - /** The nullable allocated memory for the local variables in the set. */ - pm_local_t *locals; -} pm_locals_t; - -/** The flags about scope parameters that can be set. */ -typedef uint8_t pm_scope_parameters_t; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40; - -/** - * This struct represents a node in a linked list of scopes. Some scopes can see - * into their parent scopes, while others cannot. - */ -typedef struct pm_scope { - /** A pointer to the previous scope in the linked list. */ - struct pm_scope *previous; - - /** The IDs of the locals in the given scope. */ - pm_locals_t locals; - - /** - * This is a list of the implicit parameters contained within the block. - * These will be processed after the block is parsed to determine the kind - * of parameters node that should be used and to check if any errors need to - * be added. - */ - pm_node_list_t implicit_parameters; - - /** - * This is a bitfield that indicates the parameters that are being used in - * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants. - * There are three different kinds of parameters that can be used in a - * scope: - * - * - Ordinary parameters (e.g., def foo(bar); end) - * - Numbered parameters (e.g., def foo; _1; end) - * - The it parameter (e.g., def foo; it; end) - * - * If ordinary parameters are being used, then certain parameters can be - * forwarded to another method/structure. Those are indicated by four - * additional bits in the params field. For example, some combinations of: - * - * - def foo(*); end - * - def foo(**); end - * - def foo(&); end - * - def foo(...); end - */ - pm_scope_parameters_t parameters; - - /** - * The current state of constant shareability for this scope. This is - * changed by magic shareable_constant_value comments. - */ - pm_shareable_constant_value_t shareable_constant; - - /** - * A boolean indicating whether or not this scope can see into its parent. - * If closed is true, then the scope cannot see into its parent. - */ - bool closed; -} pm_scope_t; - -/** - * A struct that represents a stack of boolean values. - */ -typedef uint32_t pm_state_stack_t; - -/** - * This struct represents the overall parser. It contains a reference to the - * source file, as well as pointers that indicate where in the source it's - * currently parsing. It also contains the most recent and current token that - * it's considering. - */ -struct pm_parser { - /** - * The next node identifier that will be assigned. This is a unique - * identifier used to track nodes such that the syntax tree can be dropped - * but the node can be found through another parse. - */ - uint32_t node_id; - - /** The current state of the lexer. */ - pm_lex_state_t lex_state; - - /** Tracks the current nesting of (), [], and {}. */ - int enclosure_nesting; - - /** - * Used to temporarily track the nesting of enclosures to determine if a { - * is the beginning of a lambda following the parameters of a lambda. - */ - int lambda_enclosure_nesting; - - /** - * Used to track the nesting of braces to ensure we get the correct value - * when we are interpolating blocks with braces. - */ - int brace_nesting; - - /** - * The stack used to determine if a do keyword belongs to the predicate of a - * while, until, or for loop. - */ - pm_state_stack_t do_loop_stack; - - /** - * The stack used to determine if a do keyword belongs to the beginning of a - * block. - */ - pm_state_stack_t accepts_block_stack; - - /** A stack of lex modes. */ - struct { - /** The current mode of the lexer. */ - pm_lex_mode_t *current; - - /** The stack of lexer modes. */ - pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; - - /** The current index into the lexer mode stack. */ - size_t index; - } lex_modes; - - /** The pointer to the start of the source. */ - const uint8_t *start; - - /** The pointer to the end of the source. */ - const uint8_t *end; - - /** The previous token we were considering. */ - pm_token_t previous; - - /** The current token we're considering. */ - pm_token_t current; - - /** - * This is a special field set on the parser when we need the parser to jump - * to a specific location when lexing the next token, as opposed to just - * using the end of the previous token. Normally this is NULL. - */ - const uint8_t *next_start; - - /** - * This field indicates the end of a heredoc whose identifier was found on - * the current line. If another heredoc is found on the same line, then this - * will be moved forward to the end of that heredoc. If no heredocs are - * found on a line then this is NULL. - */ - const uint8_t *heredoc_end; - - /** The list of comments that have been found while parsing. */ - pm_list_t comment_list; - - /** The list of magic comments that have been found while parsing. */ - pm_list_t magic_comment_list; - - /** - * An optional location that represents the location of the __END__ marker - * and the rest of the content of the file. This content is loaded into the - * DATA constant when the file being parsed is the main file being executed. - */ - pm_location_t data_loc; - - /** The list of warnings that have been found while parsing. */ - pm_list_t warning_list; - - /** The list of errors that have been found while parsing. */ - pm_list_t error_list; - - /** The current local scope. */ - pm_scope_t *current_scope; - - /** The current parsing context. */ - pm_context_node_t *current_context; - - /** - * The hash keys for the hash that is currently being parsed. This is not - * usually necessary because it can pass it down the various call chains, - * but in the event that you're parsing a hash that is being directly - * pushed into another hash with **, we need to share the hash keys so that - * we can warn for the nested hash as well. - */ - pm_static_literals_t *current_hash_keys; - - /** - * The encoding functions for the current file is attached to the parser as - * it's parsing so that it can change with a magic comment. - */ - const pm_encoding_t *encoding; - - /** - * When the encoding that is being used to parse the source is changed by - * prism, we provide the ability here to call out to a user-defined - * function. - */ - pm_encoding_changed_callback_t encoding_changed_callback; - - /** - * This pointer indicates where a comment must start if it is to be - * considered an encoding comment. - */ - const uint8_t *encoding_comment_start; - - /** - * This is an optional callback that can be attached to the parser that will - * be called whenever a new token is lexed by the parser. - */ - pm_lex_callback_t *lex_callback; - - /** - * This is the path of the file being parsed. We use the filepath when - * constructing SourceFileNodes. - */ - pm_string_t filepath; - - /** - * This constant pool keeps all of the constants defined throughout the file - * so that we can reference them later. - */ - pm_constant_pool_t constant_pool; - - /** This is the list of newline offsets in the source file. */ - pm_newline_list_t newline_list; - - /** - * We want to add a flag to integer nodes that indicates their base. We only - * want to parse these once, but we don't have space on the token itself to - * communicate this information. So we store it here and pass it through - * when we find tokens that we need it for. - */ - pm_node_flags_t integer_base; - - /** - * This string is used to pass information from the lexer to the parser. It - * is particularly necessary because of escape sequences. - */ - pm_string_t current_string; - - /** - * The line number at the start of the parse. This will be used to offset - * the line numbers of all of the locations. - */ - int32_t start_line; - - /** - * When a string-like expression is being lexed, any byte or escape sequence - * that resolves to a value whose top bit is set (i.e., >= 0x80) will - * explicitly set the encoding to the same encoding as the source. - * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that - * resolves to a value whose top bit is set, then the encoding will be - * explicitly set to UTF-8. - * - * The _next_ time this happens, if the encoding that is about to become the - * explicitly set encoding does not match the previously set explicit - * encoding, a mixed encoding error will be emitted. - * - * When the expression is finished being lexed, the explicit encoding - * controls the encoding of the expression. For the most part this means - * that the expression will either be encoded in the source encoding or - * UTF-8. This holds for all encodings except US-ASCII. If the source is - * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the - * expression will be encoded as ASCII-8BIT. - * - * Note that if the expression is a list, different elements within the same - * list can have different encodings, so this will get reset between each - * element. Furthermore all of this only applies to lists that support - * interpolation, because otherwise escapes that could change the encoding - * are ignored. - * - * At first glance, it may make more sense for this to live on the lexer - * mode, but we need it here to communicate back to the parser for character - * literals that do not push a new lexer mode. - */ - const pm_encoding_t *explicit_encoding; - - /** - * When parsing block exits (e.g., break, next, redo), we need to validate - * that they are in correct contexts. For the most part we can do this by - * looking at our parent contexts. However, modifier while and until - * expressions can change that context to make block exits valid. In these - * cases, we need to keep track of the block exits and then validate them - * after the expression has been parsed. - * - * We use a pointer here because we don't want to keep a whole list attached - * since this will only be used in the context of begin/end expressions. - */ - pm_node_list_t *current_block_exits; - - /** The version of prism that we should use to parse. */ - pm_options_version_t version; - - /** The command line flags given from the options. */ - uint8_t command_line; - - /** - * Whether or not we have found a frozen_string_literal magic comment with - * a true or false value. - * May be: - * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET - */ - int8_t frozen_string_literal; - - /** - * Whether or not we are parsing an eval string. This impacts whether or not - * we should evaluate if block exits/yields are valid. - */ - bool parsing_eval; - - /** - * Whether or not we are parsing a "partial" script, which is a script that - * will be evaluated in the context of another script, so we should not - * check jumps (next/break/etc.) for validity. - */ - bool partial_script; - - /** Whether or not we're at the beginning of a command. */ - bool command_start; - - /** Whether or not we're currently recovering from a syntax error. */ - bool recovering; - - /** - * This is very specialized behavior for when you want to parse in a context - * that does not respect encoding comments. Its main use case is translating - * into the whitequark/parser AST which re-encodes source files in UTF-8 - * before they are parsed and ignores encoding comments. - */ - bool encoding_locked; - - /** - * Whether or not the encoding has been changed by a magic comment. We use - * this to provide a fast path for the lexer instead of going through the - * function pointer. - */ - bool encoding_changed; - - /** - * This flag indicates that we are currently parsing a pattern matching - * expression and impacts that calculation of newlines. - */ - bool pattern_matching_newlines; - - /** This flag indicates that we are currently parsing a keyword argument. */ - bool in_keyword_arg; - - /** - * Whether or not the parser has seen a token that has semantic meaning - * (i.e., a token that is not a comment or whitespace). - */ - bool semantic_token_seen; - - /** - * True if the current regular expression being lexed contains only ASCII - * characters. - */ - bool current_regular_expression_ascii_only; - - /** - * By default, Ruby always warns about mismatched indentation. This can be - * toggled with a magic comment. - */ - bool warn_mismatched_indentation; -}; + * Initiate the parser with the given parser. + * + * @param parser The parser to use. + * @returns The AST representing the source. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser) PRISM_NONNULL(1); #endif diff --git a/prism/prettyprint.h b/prism/prettyprint.h index 5a52b2b6b8..0d8e416341 100644 --- a/prism/prettyprint.h +++ b/prism/prettyprint.h @@ -6,19 +6,16 @@ #ifndef PRISM_PRETTYPRINT_H #define PRISM_PRETTYPRINT_H -#include "prism/defines.h" +#include "prism/excludes.h" -#ifdef PRISM_EXCLUDE_PRETTYPRINT +#ifndef PRISM_EXCLUDE_PRETTYPRINT -void pm_prettyprint(void); - -#else - -#include <stdio.h> +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" #include "prism/ast.h" +#include "prism/buffer.h" #include "prism/parser.h" -#include "prism/util/pm_buffer.h" /** * Pretty-prints the AST represented by the given node to the given buffer. @@ -27,7 +24,7 @@ void pm_prettyprint(void); * @param parser The parser that parsed the AST. * @param node The root node of the AST to pretty-print. */ -PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node); +PRISM_EXPORTED_FUNCTION void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node) PRISM_NONNULL(1, 2, 3); #endif diff --git a/prism/prism.c b/prism/prism.c index cc634b59e3..a8bbcea097 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -1,4 +1,90 @@ -#include "prism.h" +#include "prism/compiler/accel.h" +#include "prism/compiler/fallthrough.h" +#include "prism/compiler/unused.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/arena.h" +#include "prism/internal/bit.h" +#include "prism/internal/buffer.h" +#include "prism/internal/char.h" +#include "prism/internal/comments.h" +#include "prism/internal/constant_pool.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/integer.h" +#include "prism/internal/isinf.h" +#include "prism/internal/line_offset_list.h" +#include "prism/internal/list.h" +#include "prism/internal/magic_comments.h" +#include "prism/internal/memchr.h" +#include "prism/internal/node.h" +#include "prism/internal/options.h" +#include "prism/internal/parser.h" +#include "prism/internal/regexp.h" +#include "prism/internal/serialize.h" +#include "prism/internal/source.h" +#include "prism/internal/static_literals.h" +#include "prism/internal/stringy.h" +#include "prism/internal/strncasecmp.h" +#include "prism/internal/strpbrk.h" +#include "prism/internal/tokens.h" + +#include "prism/excludes.h" +#include "prism/serialize.h" +#include "prism/stream.h" +#include "prism/version.h" + +#include <assert.h> +#include <errno.h> +#include <limits.h> +#include <locale.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> + +/** + * When we are parsing using recursive descent, we want to protect against + * malicious payloads that could attempt to crash our parser. We do this by + * specifying a maximum depth to which we are allowed to recurse. + */ +#ifndef PRISM_DEPTH_MAXIMUM + #define PRISM_DEPTH_MAXIMUM 10000 +#endif + +/** + * A simple utility macro to concatenate two tokens together, necessary when one + * of the tokens is itself a macro. + */ +#define PM_CONCATENATE(left, right) left ## right + +/** + * We want to be able to use static assertions, but they weren't standardized + * until C11. As such, we polyfill it here by making a hacky typedef that will + * fail to compile due to a negative array size if the condition is false. + */ +#if defined(_Static_assert) +# define PM_STATIC_ASSERT(line, condition, message) _Static_assert(condition, message) +#else +# define PM_STATIC_ASSERT(line, condition, message) typedef char PM_CONCATENATE(static_assert_, line)[(condition) ? 1 : -1] +#endif + +/** + * Support PRISM_LIKELY and PRISM_UNLIKELY to help the compiler optimize its + * branch predication. + */ +#if defined(__GNUC__) || defined(__clang__) + /** The compiler should predicate that this branch will be taken. */ + #define PRISM_LIKELY(x) __builtin_expect(!!(x), 1) + + /** The compiler should predicate that this branch will not be taken. */ + #define PRISM_UNLIKELY(x) __builtin_expect(!!(x), 0) +#else + /** Void because this platform does not support branch prediction hints. */ + #define PRISM_LIKELY(x) (x) + + /** Void because this platform does not support branch prediction hints. */ + #define PRISM_UNLIKELY(x) (x) +#endif /** * The prism version and the serialization format. @@ -19,6 +105,51 @@ pm_version(void) { #define MAX(a,b) (((a)>(b))?(a):(b)) /******************************************************************************/ +/* Helpful AST-related macros */ +/******************************************************************************/ + +#define U32(value_) ((uint32_t) (value_)) + +#define FL PM_NODE_FLAGS +#define UP PM_NODE_UPCAST + +#define PM_LOCATION_START(location_) ((location_)->start) +#define PM_LOCATION_END(location_) ((location_)->start + (location_)->length) + +#define PM_TOKEN_START(parser_, token_) U32((token_)->start - (parser_)->start) +#define PM_TOKEN_END(parser_, token_) U32((token_)->end - (parser_)->start) +#define PM_TOKEN_LENGTH(token_) U32((token_)->end - (token_)->start) +#define PM_TOKENS_LENGTH(left_, right_) U32((right_)->end - (left_)->start) + +#define PM_NODE_START(node_) (UP(node_)->location.start) +#define PM_NODE_LENGTH(node_) (UP(node_)->location.length) +#define PM_NODE_END(node_) (UP(node_)->location.start + UP(node_)->location.length) +#define PM_NODES_LENGTH(left_, right_) (PM_NODE_END(right_) - PM_NODE_START(left_)) + +#define PM_TOKEN_NODE_LENGTH(parser_, token_, node_) (PM_NODE_END(node_) - PM_TOKEN_START(parser_, token_)) +#define PM_NODE_TOKEN_LENGTH(parser_, node_, token_) (PM_TOKEN_END(parser_, token_) - PM_NODE_START(node_)) + +#define PM_NODE_START_SET_NODE(left_, right_) (PM_NODE_START(left_) = PM_NODE_START(right_)) +#define PM_NODE_START_SET_TOKEN(parser_, node_, token_) (PM_NODE_START(node_) = PM_TOKEN_START(parser_, token_)) +#define PM_NODE_LENGTH_SET_NODE(left_, right_) (PM_NODE_LENGTH(left_) = PM_NODE_END(right_) - PM_NODE_START(left_)) +#define PM_NODE_LENGTH_SET_TOKEN(parser_, node_, token_) (PM_NODE_LENGTH(node_) = PM_TOKEN_END(parser_, token_) - PM_NODE_START(node_)) +#define PM_NODE_LENGTH_SET_LOCATION(node_, location_) (PM_NODE_LENGTH(node_) = PM_LOCATION_END(location_) - PM_NODE_START(node_)) + +#define PM_LOCATION_INIT(start_, length_) ((pm_location_t) { .start = (start_), .length = (length_) }) +#define PM_LOCATION_INIT_UNSET PM_LOCATION_INIT(0, 0) +#define PM_LOCATION_INIT_TOKEN(parser_, token_) PM_LOCATION_INIT(PM_TOKEN_START(parser_, token_), PM_TOKEN_LENGTH(token_)) +#define PM_LOCATION_INIT_NODE(node_) UP(node_)->location + +#define PM_LOCATION_INIT_TOKENS(parser_, left_, right_) PM_LOCATION_INIT(PM_TOKEN_START(parser_, left_), PM_TOKENS_LENGTH(left_, right_)) +#define PM_LOCATION_INIT_NODES(left_, right_) PM_LOCATION_INIT(PM_NODE_START(left_), PM_NODES_LENGTH(left_, right_)) +#define PM_LOCATION_INIT_TOKEN_NODE(parser_, token_, node_) PM_LOCATION_INIT(PM_TOKEN_START(parser_, token_), PM_TOKEN_NODE_LENGTH(parser_, token_, node_)) +#define PM_LOCATION_INIT_NODE_TOKEN(parser_, node_, token_) PM_LOCATION_INIT(PM_NODE_START(node_), PM_NODE_TOKEN_LENGTH(parser_, node_, token_)) + +#define TOK2LOC(parser_, token_) PM_LOCATION_INIT_TOKEN(parser_, token_) +#define NTOK2LOC(parser_, token_) ((token_) == NULL ? PM_LOCATION_INIT_UNSET : TOK2LOC(parser_, token_)) +#define NTOK2PTR(token_) ((token_).start == NULL ? NULL : &(token_)) + +/******************************************************************************/ /* Lex mode manipulations */ /******************************************************************************/ @@ -26,7 +157,7 @@ pm_version(void) { * Returns the incrementor character that should be used to increment the * nesting count if one is possible. */ -static inline uint8_t +static PRISM_INLINE uint8_t lex_mode_incrementor(const uint8_t start) { switch (start) { case '(': @@ -43,7 +174,7 @@ lex_mode_incrementor(const uint8_t start) { * Returns the matching character that should be used to terminate a list * beginning with the given character. */ -static inline uint8_t +static PRISM_INLINE uint8_t lex_mode_terminator(const uint8_t start) { switch (start) { case '(': @@ -85,7 +216,7 @@ lex_mode_push(pm_parser_t *parser, pm_lex_mode_t lex_mode) { /** * Push on a new list lex mode. */ -static inline bool +static PRISM_INLINE bool lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) { uint8_t incrementor = lex_mode_incrementor(delimiter); uint8_t terminator = lex_mode_terminator(delimiter); @@ -103,7 +234,8 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) { // These are the places where we need to split up the content of the list. // We'll use strpbrk to find the first of these characters. uint8_t *breakpoints = lex_mode.as.list.breakpoints; - memcpy(breakpoints, "\\ \t\f\r\v\n\0\0\0", sizeof(lex_mode.as.list.breakpoints)); + memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE); + memcpy(breakpoints, "\\ \t\f\r\v\n", sizeof("\\ \t\f\r\v\n") - 1); size_t index = 7; // Now we'll add the terminator to the list of breakpoints. If the @@ -132,7 +264,7 @@ lex_mode_push_list(pm_parser_t *parser, bool interpolation, uint8_t delimiter) { * called when we're at the end of the file. We want the parser to be able to * perform its normal error tolerance. */ -static inline bool +static PRISM_INLINE bool lex_mode_push_list_eof(pm_parser_t *parser) { return lex_mode_push_list(parser, false, '\0'); } @@ -140,7 +272,7 @@ lex_mode_push_list_eof(pm_parser_t *parser) { /** * Push on a new regexp lex mode. */ -static inline bool +static PRISM_INLINE bool lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminator) { pm_lex_mode_t lex_mode = { .mode = PM_LEX_REGEXP, @@ -155,7 +287,8 @@ lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminato // regular expression. We'll use strpbrk to find the first of these // characters. uint8_t *breakpoints = lex_mode.as.regexp.breakpoints; - memcpy(breakpoints, "\r\n\\#\0\0", sizeof(lex_mode.as.regexp.breakpoints)); + memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE); + memcpy(breakpoints, "\r\n\\#", sizeof("\r\n\\#") - 1); size_t index = 4; // First we'll add the terminator. @@ -175,7 +308,7 @@ lex_mode_push_regexp(pm_parser_t *parser, uint8_t incrementor, uint8_t terminato /** * Push on a new string lex mode. */ -static inline bool +static PRISM_INLINE bool lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed, uint8_t incrementor, uint8_t terminator) { pm_lex_mode_t lex_mode = { .mode = PM_LEX_STRING, @@ -191,7 +324,8 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed // These are the places where we need to split up the content of the // string. We'll use strpbrk to find the first of these characters. uint8_t *breakpoints = lex_mode.as.string.breakpoints; - memcpy(breakpoints, "\r\n\\\0\0\0", sizeof(lex_mode.as.string.breakpoints)); + memset(breakpoints, 0, PM_STRPBRK_CACHE_SIZE); + memcpy(breakpoints, "\r\n\\", sizeof("\r\n\\") - 1); size_t index = 3; // Now add in the terminator. If the terminator is not already a NULL byte, @@ -221,7 +355,7 @@ lex_mode_push_string(pm_parser_t *parser, bool interpolation, bool label_allowed * called when we're at the end of the file. We want the parser to be able to * perform its normal error tolerance. */ -static inline bool +static PRISM_INLINE bool lex_mode_push_string_eof(pm_parser_t *parser) { return lex_mode_push_string(parser, false, false, '\0', '\0'); } @@ -241,7 +375,7 @@ lex_mode_pop(pm_parser_t *parser) { } else { parser->lex_modes.index--; pm_lex_mode_t *prev = parser->lex_modes.current->prev; - xfree(parser->lex_modes.current); + xfree_sized(parser->lex_modes.current, sizeof(pm_lex_mode_t)); parser->lex_modes.current = prev; } } @@ -249,7 +383,7 @@ lex_mode_pop(pm_parser_t *parser) { /** * This is the equivalent of IS_lex_state is CRuby. */ -static inline bool +static PRISM_INLINE bool lex_state_p(const pm_parser_t *parser, pm_lex_state_t state) { return parser->lex_state & state; } @@ -260,7 +394,7 @@ typedef enum { PM_IGNORED_NEWLINE_PATTERN } pm_ignored_newline_type_t; -static inline pm_ignored_newline_type_t +static PRISM_INLINE pm_ignored_newline_type_t lex_state_ignored_p(pm_parser_t *parser) { bool ignored = lex_state_p(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_CLASS | PM_LEX_STATE_FNAME | PM_LEX_STATE_DOT) && !lex_state_p(parser, PM_LEX_STATE_LABELED); @@ -273,17 +407,17 @@ lex_state_ignored_p(pm_parser_t *parser) { } } -static inline bool +static PRISM_INLINE bool lex_state_beg_p(pm_parser_t *parser) { return lex_state_p(parser, PM_LEX_STATE_BEG_ANY) || ((parser->lex_state & (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)) == (PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED)); } -static inline bool +static PRISM_INLINE bool lex_state_arg_p(pm_parser_t *parser) { return lex_state_p(parser, PM_LEX_STATE_ARG_ANY); } -static inline bool +static PRISM_INLINE bool lex_state_spcarg_p(pm_parser_t *parser, bool space_seen) { if (parser->current.end >= parser->end) { return false; @@ -291,7 +425,7 @@ lex_state_spcarg_p(pm_parser_t *parser, bool space_seen) { return lex_state_arg_p(parser) && space_seen && !pm_char_is_whitespace(*parser->current.end); } -static inline bool +static PRISM_INLINE bool lex_state_end_p(pm_parser_t *parser) { return lex_state_p(parser, PM_LEX_STATE_END_ANY); } @@ -299,7 +433,7 @@ lex_state_end_p(pm_parser_t *parser) { /** * This is the equivalent of IS_AFTER_OPERATOR in CRuby. */ -static inline bool +static PRISM_INLINE bool lex_state_operator_p(pm_parser_t *parser) { return lex_state_p(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_DOT); } @@ -308,7 +442,7 @@ lex_state_operator_p(pm_parser_t *parser) { * Set the state of the lexer. This is defined as a function to be able to put a * breakpoint in it. */ -static inline void +static PRISM_INLINE void lex_state_set(pm_parser_t *parser, pm_lex_state_t state) { parser->lex_state = state; } @@ -322,7 +456,7 @@ lex_state_set(pm_parser_t *parser, pm_lex_state_t state) { #endif #if PM_DEBUG_LOGGING -PRISM_ATTRIBUTE_UNUSED static void +PRISM_UNUSED static void debug_state(pm_parser_t *parser) { fprintf(stderr, "STATE: "); bool first = true; @@ -403,140 +537,134 @@ debug_lex_state_set(pm_parser_t *parser, pm_lex_state_t state, char const * call /** * Append an error to the list of errors on the parser. */ -static inline void -pm_parser_err(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { - pm_diagnostic_list_append(&parser->error_list, start, end, diag_id); +static PRISM_INLINE void +pm_parser_err(pm_parser_t *parser, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id) { + pm_diagnostic_list_append(&parser->metadata_arena, &parser->error_list, start, length, diag_id); } /** - * Append an error to the list of errors on the parser using a format string. + * Append an error to the list of errors on the parser using the location of the + * given token. */ -#define PM_PARSER_ERR_FORMAT(parser, start, end, diag_id, ...) \ - pm_diagnostic_list_append_format(&parser->error_list, start, end, diag_id, __VA_ARGS__) +static PRISM_INLINE void +pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) { + pm_parser_err(parser, PM_TOKEN_START(parser, token), PM_TOKEN_LENGTH(token), diag_id); +} /** * Append an error to the list of errors on the parser using the location of the * current token. */ -static inline void +static PRISM_INLINE void pm_parser_err_current(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { - pm_parser_err(parser, parser->current.start, parser->current.end, diag_id); + pm_parser_err_token(parser, &parser->current, diag_id); } /** - * Append an error to the list of errors on the parser using the given location - * using a format string. + * Append an error to the list of errors on the parser using the location of the + * previous token. */ -#define PM_PARSER_ERR_LOCATION_FORMAT(parser, location, diag_id, ...) \ - PM_PARSER_ERR_FORMAT(parser, (location)->start, (location)->end, diag_id, __VA_ARGS__) +static PRISM_INLINE void +pm_parser_err_previous(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { + pm_parser_err_token(parser, &parser->previous, diag_id); +} /** * Append an error to the list of errors on the parser using the location of the * given node. */ -static inline void +static PRISM_INLINE void pm_parser_err_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) { - pm_parser_err(parser, node->location.start, node->location.end, diag_id); + pm_parser_err(parser, PM_NODE_START(node), PM_NODE_LENGTH(node), diag_id); } /** - * Append an error to the list of errors on the parser using the location of the - * given node and a format string. - */ -#define PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, ...) \ - PM_PARSER_ERR_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__) - -/** - * Append an error to the list of errors on the parser using the location of the - * given node and a format string, and add on the content of the node. + * Append an error to the list of errors on the parser using a format string. */ -#define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, node, diag_id) \ - PM_PARSER_ERR_NODE_FORMAT(parser, node, diag_id, (int) ((node)->location.end - (node)->location.start), (const char *) (node)->location.start) +#define PM_PARSER_ERR_FORMAT(parser_, start_, length_, diag_id_, ...) \ + pm_diagnostic_list_append_format(&(parser_)->metadata_arena, &(parser_)->error_list, start_, length_, diag_id_, __VA_ARGS__) /** * Append an error to the list of errors on the parser using the location of the - * previous token. + * given node and a format string. */ -static inline void -pm_parser_err_previous(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { - pm_parser_err(parser, parser->previous.start, parser->previous.end, diag_id); -} +#define PM_PARSER_ERR_NODE_FORMAT(parser_, node_, diag_id_, ...) \ + PM_PARSER_ERR_FORMAT(parser_, PM_NODE_START(node_), PM_NODE_LENGTH(node_), diag_id_, __VA_ARGS__) /** * Append an error to the list of errors on the parser using the location of the - * given token. + * given node and a format string, and add on the content of the node. */ -static inline void -pm_parser_err_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) { - pm_parser_err(parser, token->start, token->end, diag_id); -} +#define PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser_, node_, diag_id_) \ + PM_PARSER_ERR_NODE_FORMAT(parser_, node_, diag_id_, (int) PM_NODE_LENGTH(node_), (const char *) (parser_->start + PM_NODE_START(node_))) /** * Append an error to the list of errors on the parser using the location of the * given token and a format string. */ -#define PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, ...) \ - PM_PARSER_ERR_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__) +#define PM_PARSER_ERR_TOKEN_FORMAT(parser_, token_, diag_id, ...) \ + PM_PARSER_ERR_FORMAT(parser_, PM_TOKEN_START(parser_, token_), PM_TOKEN_LENGTH(token_), diag_id, __VA_ARGS__) /** * Append an error to the list of errors on the parser using the location of the * given token and a format string, and add on the content of the token. */ -#define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \ - PM_PARSER_ERR_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start) +#define PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser_, token_, diag_id_) \ + PM_PARSER_ERR_TOKEN_FORMAT(parser_, token_, diag_id_, (int) PM_TOKEN_LENGTH(token_), (const char *) (token_)->start) /** * Append a warning to the list of warnings on the parser. */ -static inline void -pm_parser_warn(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { - pm_diagnostic_list_append(&parser->warning_list, start, end, diag_id); +static PRISM_INLINE void +pm_parser_warn(pm_parser_t *parser, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id) { + pm_diagnostic_list_append(&parser->metadata_arena, &parser->warning_list, start, length, diag_id); } /** * Append a warning to the list of warnings on the parser using the location of * the given token. */ -static inline void +static PRISM_INLINE void pm_parser_warn_token(pm_parser_t *parser, const pm_token_t *token, pm_diagnostic_id_t diag_id) { - pm_parser_warn(parser, token->start, token->end, diag_id); + pm_parser_warn(parser, PM_TOKEN_START(parser, token), PM_TOKEN_LENGTH(token), diag_id); } /** * Append a warning to the list of warnings on the parser using the location of * the given node. */ -static inline void +static PRISM_INLINE void pm_parser_warn_node(pm_parser_t *parser, const pm_node_t *node, pm_diagnostic_id_t diag_id) { - pm_parser_warn(parser, node->location.start, node->location.end, diag_id); + pm_parser_warn(parser, PM_NODE_START(node), PM_NODE_LENGTH(node), diag_id); } /** - * Append a warning to the list of warnings on the parser using a format string. + * Append a warning to the list of warnings on the parser using a format string + * and the given location. */ -#define PM_PARSER_WARN_FORMAT(parser, start, end, diag_id, ...) \ - pm_diagnostic_list_append_format(&parser->warning_list, start, end, diag_id, __VA_ARGS__) +#define PM_PARSER_WARN_FORMAT(parser_, start_, length_, diag_id_, ...) \ + pm_diagnostic_list_append_format(&(parser_)->metadata_arena, &(parser_)->warning_list, start_, length_, diag_id_, __VA_ARGS__) /** * Append a warning to the list of warnings on the parser using the location of * the given token and a format string. */ -#define PM_PARSER_WARN_TOKEN_FORMAT(parser, token, diag_id, ...) \ - PM_PARSER_WARN_FORMAT(parser, (token).start, (token).end, diag_id, __VA_ARGS__) +#define PM_PARSER_WARN_TOKEN_FORMAT(parser_, token_, diag_id_, ...) \ + PM_PARSER_WARN_FORMAT(parser_, PM_TOKEN_START(parser_, token_), PM_TOKEN_LENGTH(token_), diag_id_, __VA_ARGS__) /** * Append a warning to the list of warnings on the parser using the location of * the given token and a format string, and add on the content of the token. */ -#define PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, token, diag_id) \ - PM_PARSER_WARN_TOKEN_FORMAT(parser, token, diag_id, (int) ((token).end - (token).start), (const char *) (token).start) +#define PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser_, token_, diag_id_) \ + PM_PARSER_WARN_TOKEN_FORMAT(parser_, token_, diag_id_, (int) PM_TOKEN_LENGTH(token_), (const char *) (token_)->start) /** * Append a warning to the list of warnings on the parser using the location of * the given node and a format string. */ -#define PM_PARSER_WARN_NODE_FORMAT(parser, node, diag_id, ...) \ - PM_PARSER_WARN_FORMAT(parser, (node)->location.start, (node)->location.end, diag_id, __VA_ARGS__) +#define PM_PARSER_WARN_NODE_FORMAT(parser_, node_, diag_id_, ...) \ + PM_PARSER_WARN_FORMAT(parser_, PM_NODE_START(node_), PM_NODE_LENGTH(node_), diag_id_, __VA_ARGS__) /** * Add an error for an expected heredoc terminator. This is a special function @@ -547,8 +675,8 @@ static void pm_parser_err_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ident_length) { PM_PARSER_ERR_FORMAT( parser, - ident_start, - ident_start + ident_length, + U32(ident_start - parser->start), + U32(ident_length), PM_ERR_HEREDOC_TERM, (int) ident_length, (const char *) ident_start @@ -708,7 +836,7 @@ pm_parser_scope_forwarding_keywords_check(pm_parser_t *parser, const pm_token_t /** * Get the current state of constant shareability. */ -static inline pm_shareable_constant_value_t +static PRISM_INLINE pm_shareable_constant_value_t pm_parser_scope_shareable_constant_get(pm_parser_t *parser) { return parser->current_scope->shareable_constant; } @@ -733,12 +861,12 @@ pm_parser_scope_shareable_constant_set(pm_parser_t *parser, pm_shareable_constan /** * The point at which the set of locals switches from being a list to a hash. */ -#define PM_LOCALS_HASH_THRESHOLD 9 +#define PM_LOCALS_HASH_THRESHOLD 5 static void pm_locals_free(pm_locals_t *locals) { if (locals->capacity > 0) { - xfree(locals->locals); + xfree_sized(locals->locals, locals->capacity * sizeof(pm_local_t)); } } @@ -810,11 +938,13 @@ pm_locals_resize(pm_locals_t *locals) { * @return True if the local was added, and false if the local already exists. */ static bool -pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, const uint8_t *start, const uint8_t *end, uint32_t reads) { +pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, uint32_t start, uint32_t length, uint32_t reads) { if (locals->size >= (locals->capacity / 4 * 3)) { pm_locals_resize(locals); } + locals->bloom |= (1u << (name & 31)); + if (locals->capacity < PM_LOCALS_HASH_THRESHOLD) { for (uint32_t index = 0; index < locals->capacity; index++) { pm_local_t *local = &locals->locals[index]; @@ -822,7 +952,7 @@ pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, const uint8_t *start if (local->name == PM_CONSTANT_ID_UNSET) { *local = (pm_local_t) { .name = name, - .location = { .start = start, .end = end }, + .location = { .start = start, .length = length }, .index = locals->size++, .reads = reads, .hash = 0 @@ -843,7 +973,7 @@ pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, const uint8_t *start if (local->name == PM_CONSTANT_ID_UNSET) { *local = (pm_local_t) { .name = name, - .location = { .start = start, .end = end }, + .location = { .start = start, .length = length }, .index = locals->size++, .reads = reads, .hash = initial_hash @@ -867,6 +997,8 @@ pm_locals_write(pm_locals_t *locals, pm_constant_id_t name, const uint8_t *start */ static uint32_t pm_locals_find(pm_locals_t *locals, pm_constant_id_t name) { + if (!(locals->bloom & (1u << (name & 31)))) return UINT32_MAX; + if (locals->capacity < PM_LOCALS_HASH_THRESHOLD) { for (uint32_t index = 0; index < locals->size; index++) { pm_local_t *local = &locals->locals[index]; @@ -943,8 +1075,8 @@ pm_locals_reads(pm_locals_t *locals, pm_constant_id_t name) { * written but not read in certain contexts. */ static void -pm_locals_order(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, pm_locals_t *locals, pm_constant_id_list_t *list, bool toplevel) { - pm_constant_id_list_init_capacity(list, locals->size); +pm_locals_order(pm_parser_t *parser, pm_locals_t *locals, pm_constant_id_list_t *list, bool toplevel) { + pm_constant_id_list_init_capacity(parser->arena, list, locals->size); // If we're still below the threshold for switching to a hash, then we only // need to loop over the locals until we hit the size because the locals are @@ -961,14 +1093,14 @@ pm_locals_order(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, pm_locals_t *locals, if (local->name != PM_CONSTANT_ID_UNSET) { pm_constant_id_list_insert(list, (size_t) local->index, local->name); - if (warn_unused && local->reads == 0 && ((parser->start_line >= 0) || (pm_newline_list_line(&parser->newline_list, local->location.start, parser->start_line) >= 0))) { + if (warn_unused && local->reads == 0 && ((parser->start_line >= 0) || (pm_line_offset_list_line(&parser->line_offsets, local->location.start, parser->start_line) >= 0))) { pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, local->name); if (constant->length >= 1 && *constant->start != '_') { PM_PARSER_WARN_FORMAT( parser, local->location.start, - local->location.end, + local->location.length, PM_WARN_UNUSED_LOCAL_VARIABLE, (int) constant->length, (const char *) constant->start @@ -986,43 +1118,53 @@ pm_locals_order(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, pm_locals_t *locals, /** * Retrieve the constant pool id for the given location. */ -static inline pm_constant_id_t -pm_parser_constant_id_location(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { - return pm_constant_pool_insert_shared(&parser->constant_pool, start, (size_t) (end - start)); +static PRISM_INLINE pm_constant_id_t +pm_parser_constant_id_raw(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { + /* Fast path: if this is the same token as the last lookup (same pointer + * range), return the cached result. */ + if (start == parser->constant_cache.start && end == parser->constant_cache.end) { + return parser->constant_cache.id; + } + + pm_constant_id_t id = pm_constant_pool_insert_shared(&parser->metadata_arena, &parser->constant_pool, start, (size_t) (end - start)); + + parser->constant_cache.start = start; + parser->constant_cache.end = end; + parser->constant_cache.id = id; + + return id; } /** * Retrieve the constant pool id for the given string. */ -static inline pm_constant_id_t +static PRISM_INLINE pm_constant_id_t pm_parser_constant_id_owned(pm_parser_t *parser, uint8_t *start, size_t length) { - return pm_constant_pool_insert_owned(&parser->constant_pool, start, length); + return pm_constant_pool_insert_owned(&parser->metadata_arena, &parser->constant_pool, start, length); } /** * Retrieve the constant pool id for the given static literal C string. */ -static inline pm_constant_id_t +static PRISM_INLINE pm_constant_id_t pm_parser_constant_id_constant(pm_parser_t *parser, const char *start, size_t length) { - return pm_constant_pool_insert_constant(&parser->constant_pool, (const uint8_t *) start, length); + return pm_constant_pool_insert_constant(&parser->metadata_arena, &parser->constant_pool, (const uint8_t *) start, length); } /** * Retrieve the constant pool id for the given token. */ -static inline pm_constant_id_t +static PRISM_INLINE pm_constant_id_t pm_parser_constant_id_token(pm_parser_t *parser, const pm_token_t *token) { - return pm_parser_constant_id_location(parser, token->start, token->end); + return pm_parser_constant_id_raw(parser, token->start, token->end); } /** - * Retrieve the constant pool id for the given token. If the token is not - * provided, then return 0. + * This macro allows you to define a case statement for all of the nodes that + * may result in a void value. */ -static inline pm_constant_id_t -pm_parser_optional_constant_id_token(pm_parser_t *parser, const pm_token_t *token) { - return token->type == PM_TOKEN_NOT_PROVIDED ? 0 : pm_parser_constant_id_token(parser, token); -} +#define PM_CASE_VOID_VALUE PM_RETURN_NODE: case PM_BREAK_NODE: case PM_NEXT_NODE: \ + case PM_REDO_NODE: case PM_RETRY_NODE: case PM_MATCH_REQUIRED_NODE /** * Check whether or not the given node is value expression. @@ -1035,12 +1177,7 @@ pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { while (node != NULL) { switch (PM_NODE_TYPE(node)) { - case PM_RETURN_NODE: - case PM_BREAK_NODE: - case PM_NEXT_NODE: - case PM_REDO_NODE: - case PM_RETRY_NODE: - case PM_MATCH_REQUIRED_NODE: + case PM_CASE_VOID_VALUE: return void_node != NULL ? void_node : node; case PM_MATCH_PREDICATE_NODE: return NULL; @@ -1049,57 +1186,128 @@ pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { if (cast->ensure_clause != NULL) { if (cast->rescue_clause != NULL) { - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) cast->rescue_clause); + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->rescue_clause)); if (vn != NULL) return vn; } if (cast->statements != NULL) { - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) cast->statements); + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); if (vn != NULL) return vn; } - node = (pm_node_t *) cast->ensure_clause; + node = UP(cast->ensure_clause); } else if (cast->rescue_clause != NULL) { - if (cast->statements == NULL) return NULL; + // https://bugs.ruby-lang.org/issues/21669 + if (cast->else_clause == NULL || parser->version < PM_OPTIONS_VERSION_CRUBY_4_1) { + if (cast->statements == NULL) return NULL; - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) cast->statements); - if (vn == NULL) return NULL; - if (void_node == NULL) void_node = vn; + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) return NULL; + if (void_node == NULL) void_node = vn; + } for (pm_rescue_node_t *rescue_clause = cast->rescue_clause; rescue_clause != NULL; rescue_clause = rescue_clause->subsequent) { - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) rescue_clause->statements); + pm_node_t *vn = pm_check_value_expression(parser, UP(rescue_clause->statements)); + if (vn == NULL) { + // https://bugs.ruby-lang.org/issues/21669 + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1) { + return NULL; + } void_node = NULL; break; } - if (void_node == NULL) { - void_node = vn; - } } if (cast->else_clause != NULL) { - node = (pm_node_t *) cast->else_clause; + node = UP(cast->else_clause); + + // https://bugs.ruby-lang.org/issues/21669 + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1) { + pm_node_t *vn = pm_check_value_expression(parser, node); + if (vn != NULL) return vn; + } } else { return void_node; } } else { - node = (pm_node_t *) cast->statements; + node = UP(cast->statements); } break; } + case PM_CASE_NODE: { + // https://bugs.ruby-lang.org/issues/21669 + if (parser->version < PM_OPTIONS_VERSION_CRUBY_4_1) { + return NULL; + } + + pm_case_node_t *cast = (pm_case_node_t *) node; + if (cast->else_clause == NULL) return NULL; + + pm_node_t *condition; + PM_NODE_LIST_FOREACH(&cast->conditions, index, condition) { + assert(PM_NODE_TYPE_P(condition, PM_WHEN_NODE)); + + pm_when_node_t *cast = (pm_when_node_t *) condition; + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) return NULL; + if (void_node == NULL) void_node = vn; + } + + node = UP(cast->else_clause); + break; + } + case PM_CASE_MATCH_NODE: { + // https://bugs.ruby-lang.org/issues/21669 + if (parser->version < PM_OPTIONS_VERSION_CRUBY_4_1) { + return NULL; + } + + pm_case_match_node_t *cast = (pm_case_match_node_t *) node; + if (cast->else_clause == NULL) return NULL; + + pm_node_t *condition; + PM_NODE_LIST_FOREACH(&cast->conditions, index, condition) { + assert(PM_NODE_TYPE_P(condition, PM_IN_NODE)); + + pm_in_node_t *cast = (pm_in_node_t *) condition; + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); + if (vn == NULL) return NULL; + if (void_node == NULL) void_node = vn; + } + + node = UP(cast->else_clause); + break; + } case PM_ENSURE_NODE: { pm_ensure_node_t *cast = (pm_ensure_node_t *) node; - node = (pm_node_t *) cast->statements; + node = UP(cast->statements); break; } case PM_PARENTHESES_NODE: { pm_parentheses_node_t *cast = (pm_parentheses_node_t *) node; - node = (pm_node_t *) cast->body; + node = UP(cast->body); break; } case PM_STATEMENTS_NODE: { pm_statements_node_t *cast = (pm_statements_node_t *) node; + + // https://bugs.ruby-lang.org/issues/21669 + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1) { + pm_node_t *body_part; + PM_NODE_LIST_FOREACH(&cast->body, index, body_part) { + switch (PM_NODE_TYPE(body_part)) { + case PM_CASE_VOID_VALUE: + if (void_node == NULL) { + void_node = body_part; + } + return void_node; + default: break; + } + } + } + node = cast->body.nodes[cast->body.size - 1]; break; } @@ -1108,7 +1316,7 @@ pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { if (cast->statements == NULL || cast->subsequent == NULL) { return NULL; } - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) cast->statements); + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); if (vn == NULL) { return NULL; } @@ -1123,19 +1331,19 @@ pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { if (cast->statements == NULL || cast->else_clause == NULL) { return NULL; } - pm_node_t *vn = pm_check_value_expression(parser, (pm_node_t *) cast->statements); + pm_node_t *vn = pm_check_value_expression(parser, UP(cast->statements)); if (vn == NULL) { return NULL; } if (void_node == NULL) { void_node = vn; } - node = (pm_node_t *) cast->else_clause; + node = UP(cast->else_clause); break; } case PM_ELSE_NODE: { pm_else_node_t *cast = (pm_else_node_t *) node; - node = (pm_node_t *) cast->statements; + node = UP(cast->statements); break; } case PM_AND_NODE: { @@ -1165,7 +1373,7 @@ pm_check_value_expression(pm_parser_t *parser, pm_node_t *node) { return NULL; } -static inline void +static PRISM_INLINE void pm_assert_value_expression(pm_parser_t *parser, pm_node_t *node) { pm_node_t *void_node = pm_check_value_expression(parser, node); if (void_node != NULL) { @@ -1193,7 +1401,7 @@ pm_void_statement_check(pm_parser_t *parser, const pm_node_t *node) { break; case PM_CALL_NODE: { const pm_call_node_t *cast = (const pm_call_node_t *) node; - if (cast->call_operator_loc.start != NULL || cast->message_loc.start == NULL) break; + if (cast->call_operator_loc.length > 0 || cast->message_loc.length == 0) break; const pm_constant_t *message = pm_constant_pool_id_to_constant(&parser->constant_pool, cast->name); switch (message->length) { @@ -1406,10 +1614,10 @@ pm_conditional_predicate_warn_write_literal_p(const pm_node_t *node) { * Add a warning to the parser if the value that is being written inside of a * predicate to a conditional is a literal. */ -static inline void +static PRISM_INLINE void pm_conditional_predicate_warn_write_literal(pm_parser_t *parser, const pm_node_t *node) { if (pm_conditional_predicate_warn_write_literal_p(node)) { - pm_parser_warn_node(parser, node, parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_WARN_EQUAL_IN_CONDITIONAL_3_3 : PM_WARN_EQUAL_IN_CONDITIONAL); + pm_parser_warn_node(parser, node, parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_WARN_EQUAL_IN_CONDITIONAL_3_3 : PM_WARN_EQUAL_IN_CONDITIONAL); } } @@ -1547,26 +1755,6 @@ pm_conditional_predicate(pm_parser_t *parser, pm_node_t *node, pm_conditional_pr } /** - * In a lot of places in the tree you can have tokens that are not provided but - * that do not cause an error. For example, this happens in a method call - * without parentheses. In these cases we set the token to the "not provided" type. - * For example: - * - * pm_token_t token = not_provided(parser); - */ -static inline pm_token_t -not_provided(pm_parser_t *parser) { - return (pm_token_t) { .type = PM_TOKEN_NOT_PROVIDED, .start = parser->start, .end = parser->start }; -} - -#define PM_LOCATION_NULL_VALUE(parser) ((pm_location_t) { .start = (parser)->start, .end = (parser)->start }) -#define PM_LOCATION_TOKEN_VALUE(token) ((pm_location_t) { .start = (token)->start, .end = (token)->end }) -#define PM_LOCATION_NODE_VALUE(node) ((pm_location_t) { .start = (node)->location.start, .end = (node)->location.end }) -#define PM_LOCATION_NODE_BASE_VALUE(node) ((pm_location_t) { .start = (node)->base.location.start, .end = (node)->base.location.end }) -#define PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE ((pm_location_t) { .start = NULL, .end = NULL }) -#define PM_OPTIONAL_LOCATION_TOKEN_VALUE(token) ((token)->type == PM_TOKEN_NOT_PROVIDED ? PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE : PM_LOCATION_TOKEN_VALUE(token)) - -/** * This is a special out parameter to the parse_arguments_list function that * includes opening and closing parentheses in addition to the arguments since * it's so common. It is handy to use when passing argument information to one @@ -1592,22 +1780,29 @@ typedef struct { /** * Retrieve the end location of a `pm_arguments_t` object. */ -static inline const uint8_t * +static PRISM_INLINE const pm_location_t * pm_arguments_end(pm_arguments_t *arguments) { if (arguments->block != NULL) { - const uint8_t *end = arguments->block->location.end; - if (arguments->closing_loc.start != NULL && arguments->closing_loc.end > end) { - end = arguments->closing_loc.end; + uint32_t end = PM_NODE_END(arguments->block); + + if (arguments->closing_loc.length > 0) { + uint32_t arguments_end = PM_LOCATION_END(&arguments->closing_loc); + if (arguments_end > end) { + return &arguments->closing_loc; + } } - return end; + return &arguments->block->location; } - if (arguments->closing_loc.start != NULL) { - return arguments->closing_loc.end; + if (arguments->closing_loc.length > 0) { + return &arguments->closing_loc; } if (arguments->arguments != NULL) { - return arguments->arguments->base.location.end; + return &arguments->arguments->base.location; + } + if (arguments->opening_loc.length > 0) { + return &arguments->opening_loc; } - return arguments->closing_loc.end; + return NULL; } /** @@ -1618,7 +1813,7 @@ static void pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_block_node_t *block) { // First, check that we have arguments and that we don't have a closing // location for them. - if (arguments->arguments == NULL || arguments->closing_loc.start != NULL) { + if (arguments->arguments == NULL || arguments->closing_loc.length > 0) { return; } @@ -1635,7 +1830,7 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b // If we didn't hit a case before this check, then at this point we need to // add a syntax error. - pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_UNEXPECTED_BLOCK); + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_UNEXPECTED_BLOCK); } /******************************************************************************/ @@ -1648,7 +1843,7 @@ pm_arguments_validate_block(pm_parser_t *parser, pm_arguments_t *arguments, pm_b * reason we have the encoding_changed boolean to check if we need to go through * the function pointer or can just directly use the UTF-8 functions. */ -static inline size_t +static PRISM_INLINE size_t char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) { if (n <= 0) return 0; @@ -1675,7 +1870,7 @@ char_is_identifier_start(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t * Similar to char_is_identifier but this function assumes that the encoding * has not been changed. */ -static inline size_t +static PRISM_INLINE size_t char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) { if (n <= 0) { return 0; @@ -1687,11 +1882,189 @@ char_is_identifier_utf8(const uint8_t *b, ptrdiff_t n) { } /** + * Scan forward through ASCII identifier characters (a-z, A-Z, 0-9, _) using + * wide operations. Returns the number of leading ASCII identifier bytes. + * Callers must handle any remaining bytes (short tail or non-ASCII/UTF-8) + * with a byte-at-a-time loop. + * + * Up to three optimized implementations are selected at compile time, with a + * no-op fallback for unsupported platforms: + * 1. NEON — processes 16 bytes per iteration on aarch64. + * 2. SSSE3 — processes 16 bytes per iteration on x86-64. + * 3. SWAR — little-endian fallback, processes 8 bytes per iteration. + */ + +#if defined(PRISM_HAS_NEON) +#include <arm_neon.h> + +static PRISM_INLINE size_t +scan_identifier_ascii(const uint8_t *start, const uint8_t *end) { + const uint8_t *cursor = start; + + // Nibble-based lookup tables for classifying [a-zA-Z0-9_]. + // Each high nibble is assigned a unique bit; the low nibble table + // contains the OR of bits for all high nibbles that have an + // identifier character at that low nibble position. A byte is an + // identifier character iff (low_lut[lo] & high_lut[hi]) != 0. + static const uint8_t low_lut_data[16] = { + 0x15, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, + 0x1F, 0x1F, 0x1E, 0x0A, 0x0A, 0x0A, 0x0A, 0x0E + }; + static const uint8_t high_lut_data[16] = { + 0x00, 0x00, 0x00, 0x01, 0x02, 0x04, 0x08, 0x10, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + }; + const uint8x16_t low_lut = vld1q_u8(low_lut_data); + const uint8x16_t high_lut = vld1q_u8(high_lut_data); + const uint8x16_t mask_0f = vdupq_n_u8(0x0F); + + while (cursor + 16 <= end) { + uint8x16_t v = vld1q_u8(cursor); + + uint8x16_t lo_class = vqtbl1q_u8(low_lut, vandq_u8(v, mask_0f)); + uint8x16_t hi_class = vqtbl1q_u8(high_lut, vshrq_n_u8(v, 4)); + uint8x16_t ident = vandq_u8(lo_class, hi_class); + + // Fast check: if the per-byte minimum is nonzero, every byte matched. + if (vminvq_u8(ident) != 0) { + cursor += 16; + continue; + } + + // Find the first non-identifier byte (zero in ident). + uint8x16_t is_zero = vceqq_u8(ident, vdupq_n_u8(0)); + uint64_t lo = vgetq_lane_u64(vreinterpretq_u64_u8(is_zero), 0); + + if (lo != 0) { + cursor += pm_ctzll(lo) / 8; + } else { + uint64_t hi = vgetq_lane_u64(vreinterpretq_u64_u8(is_zero), 1); + cursor += 8 + pm_ctzll(hi) / 8; + } + + return (size_t) (cursor - start); + } + + return (size_t) (cursor - start); +} + +#elif defined(PRISM_HAS_SSSE3) +#include <tmmintrin.h> + +static PRISM_INLINE size_t +scan_identifier_ascii(const uint8_t *start, const uint8_t *end) { + const uint8_t *cursor = start; + + while (cursor + 16 <= end) { + __m128i v = _mm_loadu_si128((const __m128i *) cursor); + __m128i zero = _mm_setzero_si128(); + + // Unsigned range check via saturating subtraction: + // byte >= lo ⟺ saturate(lo - byte) == 0 + // byte <= hi ⟺ saturate(byte - hi) == 0 + + // Fold case: OR with 0x20 maps A-Z to a-z. + __m128i lowered = _mm_or_si128(v, _mm_set1_epi8(0x20)); + __m128i letter = _mm_and_si128( + _mm_cmpeq_epi8(_mm_subs_epu8(_mm_set1_epi8(0x61), lowered), zero), + _mm_cmpeq_epi8(_mm_subs_epu8(lowered, _mm_set1_epi8(0x7A)), zero)); + + __m128i digit = _mm_and_si128( + _mm_cmpeq_epi8(_mm_subs_epu8(_mm_set1_epi8(0x30), v), zero), + _mm_cmpeq_epi8(_mm_subs_epu8(v, _mm_set1_epi8(0x39)), zero)); + + __m128i underscore = _mm_cmpeq_epi8(v, _mm_set1_epi8(0x5F)); + + __m128i ident = _mm_or_si128(_mm_or_si128(letter, digit), underscore); + int mask = _mm_movemask_epi8(ident); + + if (mask == 0xFFFF) { + cursor += 16; + continue; + } + + cursor += pm_ctzll((uint64_t) (~mask & 0xFFFF)); + return (size_t) (cursor - start); + } + + return (size_t) (cursor - start); +} + +// The SWAR path uses pm_ctzll to find the first non-matching byte within a +// word, which only yields the correct byte index on little-endian targets. +// We gate on a positive little-endian check so that unknown-endianness +// platforms safely fall through to the no-op fallback. +#elif defined(PRISM_HAS_SWAR) + +/** + * Portable SWAR fallback — processes 8 bytes per iteration. + * + * The byte-wise range checks avoid cross-byte borrows by pre-setting the high + * bit of each byte before subtraction: (byte | 0x80) - lo has a minimum value + * of 0x80 - 0x7F = 1, so underflow (and thus a borrow into the next byte) is + * impossible. The result has bit 7 set if and only if byte >= lo. The same + * reasoning applies to the upper-bound direction. + */ +static PRISM_INLINE size_t +scan_identifier_ascii(const uint8_t *start, const uint8_t *end) { + static const uint64_t ones = 0x0101010101010101ULL; + static const uint64_t highs = 0x8080808080808080ULL; + const uint8_t *cursor = start; + + while (cursor + 8 <= end) { + uint64_t word; + memcpy(&word, cursor, 8); + + // Bail on any non-ASCII byte. + if (word & highs) break; + + uint64_t digit = ((word | highs) - ones * 0x30) & ((ones * 0x39 | highs) - word) & highs; + + // Fold upper- and lowercase together by forcing bit 5 (OR 0x20), + // then check the lowercase range once. A-Z maps to a-z; the + // only non-letter byte that could alias into [0x61,0x7A] is one + // whose original value was in [0x41,0x5A] — which is exactly + // the uppercase letters we want to match. + uint64_t lowered = word | (ones * 0x20); + uint64_t letter = ((lowered | highs) - ones * 0x61) & ((ones * 0x7A | highs) - lowered) & highs; + + // Standard SWAR "has zero byte" idiom on (word XOR 0x5F) to find + // bytes equal to underscore. Safe from cross-byte borrows because + // the ASCII guard above ensures all bytes are < 0x80. + uint64_t xor_us = word ^ (ones * 0x5F); + uint64_t underscore = (xor_us - ones) & ~xor_us & highs; + + uint64_t ident = digit | letter | underscore; + + if (ident == highs) { + cursor += 8; + continue; + } + + // Find the first non-identifier byte. On little-endian the first + // byte sits in the least-significant position. + uint64_t not_ident = ~ident & highs; + cursor += pm_ctzll(not_ident) / 8; + return (size_t) (cursor - start); + } + + return (size_t) (cursor - start); +} + +#else + +// No-op fallback for big-endian or other unsupported platforms. +// The caller's byte-at-a-time loop handles everything. +#define scan_identifier_ascii(start, end) ((size_t) 0) + +#endif + +/** * Like the above, this function is also used extremely frequently to lex all of * the identifiers in a source file once the first character has been found. So * it's important that it be as fast as possible. */ -static inline size_t +static PRISM_INLINE size_t char_is_identifier(const pm_parser_t *parser, const uint8_t *b, ptrdiff_t n) { if (n <= 0) { return 0; @@ -1729,7 +2102,7 @@ const unsigned int pm_global_name_punctuation_hash[(0x7e - 0x20 + 31) / 32] = { #undef BIT #undef PUNCT -static inline bool +static PRISM_INLINE bool char_is_global_name_punctuation(const uint8_t b) { const unsigned int i = (const unsigned int) b; if (i <= 0x20 || 0x7e < i) return false; @@ -1737,7 +2110,7 @@ char_is_global_name_punctuation(const uint8_t b) { return (pm_global_name_punctuation_hash[(i - 0x20) / 32] >> (i % 32)) & 1; } -static inline bool +static PRISM_INLINE bool token_is_setter_name(pm_token_t *token) { return ( (token->type == PM_TOKEN_BRACKET_LEFT_RIGHT_EQUAL) || @@ -1825,7 +2198,7 @@ pm_local_is_keyword(const char *source, size_t length) { /** * Set the given flag on the given node. */ -static inline void +static PRISM_INLINE void pm_node_flag_set(pm_node_t *node, pm_node_flags_t flag) { node->flags |= flag; } @@ -1833,7 +2206,7 @@ pm_node_flag_set(pm_node_t *node, pm_node_flags_t flag) { /** * Remove the given flag from the given node. */ -static inline void +static PRISM_INLINE void pm_node_flag_unset(pm_node_t *node, pm_node_flags_t flag) { node->flags &= (pm_node_flags_t) ~flag; } @@ -1841,7 +2214,7 @@ pm_node_flag_unset(pm_node_t *node, pm_node_flags_t flag) { /** * Set the repeated parameter flag on the given node. */ -static inline void +static PRISM_INLINE void pm_node_flag_set_repeated_parameter(pm_node_t *node) { assert(PM_NODE_TYPE(node) == PM_BLOCK_LOCAL_VARIABLE_NODE || PM_NODE_TYPE(node) == PM_BLOCK_PARAMETER_NODE || @@ -1869,7 +2242,7 @@ pm_node_flag_set_repeated_parameter(pm_node_t *node) { /** * Parse out the options for a regular expression. */ -static inline pm_node_flags_t +static PRISM_INLINE pm_node_flags_t pm_regular_expression_flags_create(pm_parser_t *parser, const pm_token_t *closing) { pm_node_flags_t flags = 0; @@ -1895,9 +2268,9 @@ pm_regular_expression_flags_create(pm_parser_t *parser, const pm_token_t *closin size_t unknown_flags_length = pm_buffer_length(&unknown_flags); if (unknown_flags_length != 0) { const char *word = unknown_flags_length >= 2 ? "options" : "option"; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_REGEXP_UNKNOWN_OPTIONS, word, unknown_flags_length, pm_buffer_value(&unknown_flags)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, PM_ERR_REGEXP_UNKNOWN_OPTIONS, word, unknown_flags_length, pm_buffer_value(&unknown_flags)); } - pm_buffer_free(&unknown_flags); + pm_buffer_cleanup(&unknown_flags); } return flags; @@ -1915,36 +2288,45 @@ static size_t pm_statements_node_body_length(pm_statements_node_t *node); /** - * This function is here to allow us a place to extend in the future when we - * implement our own arena allocation. + * Move an integer's values array into the arena. If the integer has heap- + * allocated values, copy them to the arena and free the original. */ -static inline void * -pm_node_alloc(PRISM_ATTRIBUTE_UNUSED pm_parser_t *parser, size_t size) { - void *memory = xcalloc(1, size); - if (memory == NULL) { - fprintf(stderr, "Failed to allocate %d bytes\n", (int) size); - abort(); +static PRISM_INLINE void +pm_integer_arena_move(pm_arena_t *arena, pm_integer_t *integer) { + if (integer->values != NULL) { + size_t byte_size = integer->length * sizeof(uint32_t); + uint32_t *old_values = integer->values; + integer->values = (uint32_t *) pm_arena_memdup(arena, old_values, byte_size, PRISM_ALIGNOF(uint32_t)); + xfree(old_values); } - return memory; } -#define PM_NODE_ALLOC(parser, type) (type *) pm_node_alloc(parser, sizeof(type)) -#define PM_NODE_IDENTIFY(parser) (++parser->node_id) - /** - * Allocate a new MissingNode node. + * Allocate a new ErrorRecoveryNode node with no unexpected child. */ -static pm_missing_node_t * -pm_missing_node_create(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { - pm_missing_node_t *node = PM_NODE_ALLOC(parser, pm_missing_node_t); - - *node = (pm_missing_node_t) {{ - .type = PM_MISSING_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = start, .end = end } - }}; +static pm_error_recovery_node_t * +pm_error_recovery_node_create(pm_parser_t *parser, uint32_t start, uint32_t length) { + return pm_error_recovery_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = length }), + NULL + ); +} - return node; +/** + * Allocate a new ErrorRecoveryNode node wrapping an unexpected child node. + */ +static pm_error_recovery_node_t * +pm_error_recovery_node_create_unexpected(pm_parser_t *parser, pm_node_t *unexpected) { + return pm_error_recovery_node_new( + parser->arena, + ++parser->node_id, + 0, + unexpected->location, + unexpected + ); } /** @@ -1953,23 +2335,16 @@ pm_missing_node_create(pm_parser_t *parser, const uint8_t *start, const uint8_t static pm_alias_global_variable_node_t * pm_alias_global_variable_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *new_name, pm_node_t *old_name) { assert(keyword->type == PM_TOKEN_KEYWORD_ALIAS); - pm_alias_global_variable_node_t *node = PM_NODE_ALLOC(parser, pm_alias_global_variable_node_t); - - *node = (pm_alias_global_variable_node_t) { - { - .type = PM_ALIAS_GLOBAL_VARIABLE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = old_name->location.end - }, - }, - .new_name = new_name, - .old_name = old_name, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) - }; - return node; + return pm_alias_global_variable_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, old_name), + new_name, + old_name, + TOK2LOC(parser, keyword) + ); } /** @@ -1978,23 +2353,16 @@ pm_alias_global_variable_node_create(pm_parser_t *parser, const pm_token_t *keyw static pm_alias_method_node_t * pm_alias_method_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *new_name, pm_node_t *old_name) { assert(keyword->type == PM_TOKEN_KEYWORD_ALIAS); - pm_alias_method_node_t *node = PM_NODE_ALLOC(parser, pm_alias_method_node_t); - *node = (pm_alias_method_node_t) { - { - .type = PM_ALIAS_METHOD_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = old_name->location.end - }, - }, - .new_name = new_name, - .old_name = old_name, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) - }; - - return node; + return pm_alias_method_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, old_name), + new_name, + old_name, + TOK2LOC(parser, keyword) + ); } /** @@ -2002,23 +2370,15 @@ pm_alias_method_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_n */ static pm_alternation_pattern_node_t * pm_alternation_pattern_node_create(pm_parser_t *parser, pm_node_t *left, pm_node_t *right, const pm_token_t *operator) { - pm_alternation_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_alternation_pattern_node_t); - - *node = (pm_alternation_pattern_node_t) { - { - .type = PM_ALTERNATION_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = left->location.start, - .end = right->location.end - }, - }, - .left = left, - .right = right, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_alternation_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(left, right), + left, + right, + TOK2LOC(parser, operator) + ); } /** @@ -2028,23 +2388,15 @@ static pm_and_node_t * pm_and_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { pm_assert_value_expression(parser, left); - pm_and_node_t *node = PM_NODE_ALLOC(parser, pm_and_node_t); - - *node = (pm_and_node_t) { - { - .type = PM_AND_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = left->location.start, - .end = right->location.end - }, - }, - .left = left, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .right = right - }; - - return node; + return pm_and_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(left, right), + left, + right, + TOK2LOC(parser, operator) + ); } /** @@ -2052,18 +2404,13 @@ pm_and_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *opera */ static pm_arguments_node_t * pm_arguments_node_create(pm_parser_t *parser) { - pm_arguments_node_t *node = PM_NODE_ALLOC(parser, pm_arguments_node_t); - - *node = (pm_arguments_node_t) { - { - .type = PM_ARGUMENTS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .arguments = { 0 } - }; - - return node; + return pm_arguments_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }) + ); } /** @@ -2078,19 +2425,22 @@ pm_arguments_node_size(pm_arguments_node_t *node) { * Append an argument to an arguments node. */ static void -pm_arguments_node_arguments_append(pm_arguments_node_t *node, pm_node_t *argument) { +pm_arguments_node_arguments_append(pm_arena_t *arena, pm_arguments_node_t *node, pm_node_t *argument) { if (pm_arguments_node_size(node) == 0) { - node->base.location.start = argument->location.start; + PM_NODE_START_SET_NODE(node, argument); } - node->base.location.end = argument->location.end; - pm_node_list_append(&node->arguments, argument); + if (PM_NODE_END(node) < PM_NODE_END(argument)) { + PM_NODE_LENGTH_SET_NODE(node, argument); + } + + pm_node_list_append(arena, &node->arguments, argument); if (PM_NODE_TYPE_P(argument, PM_SPLAT_NODE)) { if (PM_NODE_FLAG_P(node, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT)) { - pm_node_flag_set((pm_node_t *) node, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_MULTIPLE_SPLATS); + pm_node_flag_set(UP(node), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_MULTIPLE_SPLATS); } else { - pm_node_flag_set((pm_node_t *) node, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT); + pm_node_flag_set(UP(node), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT); } } } @@ -2100,43 +2450,49 @@ pm_arguments_node_arguments_append(pm_arguments_node_t *node, pm_node_t *argumen */ static pm_array_node_t * pm_array_node_create(pm_parser_t *parser, const pm_token_t *opening) { - pm_array_node_t *node = PM_NODE_ALLOC(parser, pm_array_node_t); - - *node = (pm_array_node_t) { - { - .type = PM_ARRAY_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(opening) - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .elements = { 0 } - }; - - return node; + if (opening == NULL) { + return pm_array_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); + } else { + return pm_array_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, opening), + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, opening), + TOK2LOC(parser, opening) + ); + } } /** * Append an argument to an array node. */ -static inline void -pm_array_node_elements_append(pm_array_node_t *node, pm_node_t *element) { - if (!node->elements.size && !node->opening_loc.start) { - node->base.location.start = element->location.start; +static PRISM_INLINE void +pm_array_node_elements_append(pm_arena_t *arena, pm_array_node_t *node, pm_node_t *element) { + if (!node->elements.size && !node->opening_loc.length) { + PM_NODE_START_SET_NODE(node, element); } - pm_node_list_append(&node->elements, element); - node->base.location.end = element->location.end; + pm_node_list_append(arena, &node->elements, element); + PM_NODE_LENGTH_SET_NODE(node, element); // If the element is not a static literal, then the array is not a static // literal. Turn that flag off. if (PM_NODE_TYPE_P(element, PM_ARRAY_NODE) || PM_NODE_TYPE_P(element, PM_HASH_NODE) || PM_NODE_TYPE_P(element, PM_RANGE_NODE) || !PM_NODE_FLAG_P(element, PM_NODE_FLAG_STATIC_LITERAL)) { - pm_node_flag_unset((pm_node_t *)node, PM_NODE_FLAG_STATIC_LITERAL); + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); } if (PM_NODE_TYPE_P(element, PM_SPLAT_NODE)) { - pm_node_flag_set((pm_node_t *)node, PM_ARRAY_NODE_FLAGS_CONTAINS_SPLAT); + pm_node_flag_set(UP(node), PM_ARRAY_NODE_FLAGS_CONTAINS_SPLAT); } } @@ -2144,10 +2500,10 @@ pm_array_node_elements_append(pm_array_node_t *node, pm_node_t *element) { * Set the closing token and end location of an array node. */ static void -pm_array_node_close_set(pm_array_node_t *node, const pm_token_t *closing) { - assert(closing->type == PM_TOKEN_BRACKET_RIGHT || closing->type == PM_TOKEN_STRING_END || closing->type == PM_TOKEN_MISSING || closing->type == PM_TOKEN_NOT_PROVIDED); - node->base.location.end = closing->end; - node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); +pm_array_node_close_set(const pm_parser_t *parser, pm_array_node_t *node, const pm_token_t *closing) { + assert(closing->type == PM_TOKEN_BRACKET_RIGHT || closing->type == PM_TOKEN_STRING_END || closing->type == 0); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); + node->closing_loc = TOK2LOC(parser, closing); } /** @@ -2156,24 +2512,18 @@ pm_array_node_close_set(pm_array_node_t *node, const pm_token_t *closing) { */ static pm_array_pattern_node_t * pm_array_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *nodes) { - pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); - - *node = (pm_array_pattern_node_t) { - { - .type = PM_ARRAY_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = nodes->nodes[0]->location.start, - .end = nodes->nodes[nodes->size - 1]->location.end - }, - }, - .constant = NULL, - .rest = NULL, - .requireds = { 0 }, - .posts = { 0 }, - .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; + pm_array_pattern_node_t *node = pm_array_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(nodes->nodes[0], nodes->nodes[nodes->size - 1]), + NULL, + ((pm_node_list_t) { 0 }), + NULL, + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); // For now we're going to just copy over each pointer manually. This could be // much more efficient, as we could instead resize the node list. @@ -2185,9 +2535,9 @@ pm_array_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *node node->rest = child; found_rest = true; } else if (found_rest) { - pm_node_list_append(&node->posts, child); + pm_node_list_append(parser->arena, &node->posts, child); } else { - pm_node_list_append(&node->requireds, child); + pm_node_list_append(parser->arena, &node->requireds, child); } } @@ -2199,23 +2549,18 @@ pm_array_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *node */ static pm_array_pattern_node_t * pm_array_pattern_node_rest_create(pm_parser_t *parser, pm_node_t *rest) { - pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); - - *node = (pm_array_pattern_node_t) { - { - .type = PM_ARRAY_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = rest->location, - }, - .constant = NULL, - .rest = rest, - .requireds = { 0 }, - .posts = { 0 }, - .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + return pm_array_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODE(rest), + NULL, + ((pm_node_list_t) { 0 }), + rest, + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); } /** @@ -2224,26 +2569,18 @@ pm_array_pattern_node_rest_create(pm_parser_t *parser, pm_node_t *rest) { */ static pm_array_pattern_node_t * pm_array_pattern_node_constant_create(pm_parser_t *parser, pm_node_t *constant, const pm_token_t *opening, const pm_token_t *closing) { - pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); - - *node = (pm_array_pattern_node_t) { - { - .type = PM_ARRAY_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = constant->location.start, - .end = closing->end - }, - }, - .constant = constant, - .rest = NULL, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .requireds = { 0 }, - .posts = { 0 } - }; - - return node; + return pm_array_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODE_TOKEN(parser, constant, closing), + constant, + ((pm_node_list_t) { 0 }), + NULL, + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -2252,31 +2589,23 @@ pm_array_pattern_node_constant_create(pm_parser_t *parser, pm_node_t *constant, */ static pm_array_pattern_node_t * pm_array_pattern_node_empty_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { - pm_array_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_array_pattern_node_t); - - *node = (pm_array_pattern_node_t) { - { - .type = PM_ARRAY_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - }, - }, - .constant = NULL, - .rest = NULL, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .requireds = { 0 }, - .posts = { 0 } - }; - - return node; + return pm_array_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + NULL, + ((pm_node_list_t) { 0 }), + NULL, + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } -static inline void -pm_array_pattern_node_requireds_append(pm_array_pattern_node_t *node, pm_node_t *inner) { - pm_node_list_append(&node->requireds, inner); +static PRISM_INLINE void +pm_array_pattern_node_requireds_append(pm_arena_t *arena, pm_array_pattern_node_t *node, pm_node_t *inner) { + pm_node_list_append(arena, &node->requireds, inner); } /** @@ -2284,15 +2613,14 @@ pm_array_pattern_node_requireds_append(pm_array_pattern_node_t *node, pm_node_t */ static pm_assoc_node_t * pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *operator, pm_node_t *value) { - pm_assoc_node_t *node = PM_NODE_ALLOC(parser, pm_assoc_node_t); - const uint8_t *end; + uint32_t end; - if (value != NULL && value->location.end > key->location.end) { - end = value->location.end; - } else if (operator->type != PM_TOKEN_NOT_PROVIDED) { - end = operator->end; + if (value != NULL && PM_NODE_END(value) > PM_NODE_END(key)) { + end = PM_NODE_END(value); + } else if (operator != NULL) { + end = PM_TOKEN_END(parser, operator); } else { - end = key->location.end; + end = PM_NODE_END(key); } // Hash string keys will be frozen, so we can mark them as frozen here so @@ -2312,22 +2640,15 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper flags = key->flags & value->flags & PM_NODE_FLAG_STATIC_LITERAL; } - *node = (pm_assoc_node_t) { - { - .type = PM_ASSOC_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = key->location.start, - .end = end - }, - }, - .key = key, - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_assoc_node_new( + parser->arena, + ++parser->node_id, + flags, + ((pm_location_t) { .start = PM_NODE_START(key), .length = U32(end - PM_NODE_START(key)) }), + key, + value, + NTOK2LOC(parser, operator) + ); } /** @@ -2336,22 +2657,15 @@ pm_assoc_node_create(pm_parser_t *parser, pm_node_t *key, const pm_token_t *oper static pm_assoc_splat_node_t * pm_assoc_splat_node_create(pm_parser_t *parser, pm_node_t *value, const pm_token_t *operator) { assert(operator->type == PM_TOKEN_USTAR_STAR); - pm_assoc_splat_node_t *node = PM_NODE_ALLOC(parser, pm_assoc_splat_node_t); - - *node = (pm_assoc_splat_node_t) { - { - .type = PM_ASSOC_SPLAT_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = value == NULL ? operator->end : value->location.end - }, - }, - .value = value, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - return node; + return pm_assoc_splat_node_new( + parser->arena, + ++parser->node_id, + 0, + (value == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKEN_NODE(parser, operator, value), + value, + TOK2LOC(parser, operator) + ); } /** @@ -2360,18 +2674,14 @@ pm_assoc_splat_node_create(pm_parser_t *parser, pm_node_t *value, const pm_token static pm_back_reference_read_node_t * pm_back_reference_read_node_create(pm_parser_t *parser, const pm_token_t *name) { assert(name->type == PM_TOKEN_BACK_REFERENCE); - pm_back_reference_read_node_t *node = PM_NODE_ALLOC(parser, pm_back_reference_read_node_t); - *node = (pm_back_reference_read_node_t) { - { - .type = PM_BACK_REFERENCE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name), - }, - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + return pm_back_reference_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_parser_constant_id_token(parser, name) + ); } /** @@ -2379,23 +2689,21 @@ pm_back_reference_read_node_create(pm_parser_t *parser, const pm_token_t *name) */ static pm_begin_node_t * pm_begin_node_create(pm_parser_t *parser, const pm_token_t *begin_keyword, pm_statements_node_t *statements) { - pm_begin_node_t *node = PM_NODE_ALLOC(parser, pm_begin_node_t); - - *node = (pm_begin_node_t) { - { - .type = PM_BEGIN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = begin_keyword->start, - .end = statements == NULL ? begin_keyword->end : statements->base.location.end - }, - }, - .begin_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(begin_keyword), - .statements = statements, - .end_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + uint32_t start = begin_keyword == NULL ? 0 : PM_TOKEN_START(parser, begin_keyword); + uint32_t end = statements == NULL ? (begin_keyword == NULL ? 0 : PM_TOKEN_END(parser, begin_keyword)) : PM_NODE_END(statements); + + return pm_begin_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NTOK2LOC(parser, begin_keyword), + statements, + NULL, + NULL, + NULL, + ((pm_location_t) { 0 }) + ); } /** @@ -2403,11 +2711,10 @@ pm_begin_node_create(pm_parser_t *parser, const pm_token_t *begin_keyword, pm_st */ static void pm_begin_node_rescue_clause_set(pm_begin_node_t *node, pm_rescue_node_t *rescue_clause) { - // If the begin keyword doesn't exist, we set the start on the begin_node - if (!node->begin_keyword_loc.start) { - node->base.location.start = rescue_clause->base.location.start; + if (node->begin_keyword_loc.length == 0) { + PM_NODE_START_SET_NODE(node, rescue_clause); } - node->base.location.end = rescue_clause->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, rescue_clause); node->rescue_clause = rescue_clause; } @@ -2416,7 +2723,10 @@ pm_begin_node_rescue_clause_set(pm_begin_node_t *node, pm_rescue_node_t *rescue_ */ static void pm_begin_node_else_clause_set(pm_begin_node_t *node, pm_else_node_t *else_clause) { - node->base.location.end = else_clause->base.location.end; + if ((node->begin_keyword_loc.length == 0) && PM_NODE_START(node) == 0) { + PM_NODE_START_SET_NODE(node, else_clause); + } + PM_NODE_LENGTH_SET_NODE(node, else_clause); node->else_clause = else_clause; } @@ -2425,7 +2735,10 @@ pm_begin_node_else_clause_set(pm_begin_node_t *node, pm_else_node_t *else_clause */ static void pm_begin_node_ensure_clause_set(pm_begin_node_t *node, pm_ensure_node_t *ensure_clause) { - node->base.location.end = ensure_clause->base.location.end; + if ((node->begin_keyword_loc.length == 0) && PM_NODE_START(node) == 0) { + PM_NODE_START_SET_NODE(node, ensure_clause); + } + PM_NODE_LENGTH_SET_NODE(node, ensure_clause); node->ensure_clause = ensure_clause; } @@ -2433,11 +2746,10 @@ pm_begin_node_ensure_clause_set(pm_begin_node_t *node, pm_ensure_node_t *ensure_ * Set the end keyword and end location of a begin node. */ static void -pm_begin_node_end_keyword_set(pm_begin_node_t *node, const pm_token_t *end_keyword) { - assert(end_keyword->type == PM_TOKEN_KEYWORD_END || end_keyword->type == PM_TOKEN_MISSING); - - node->base.location.end = end_keyword->end; - node->end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword); +pm_begin_node_end_keyword_set(const pm_parser_t *parser, pm_begin_node_t *node, const pm_token_t *end_keyword) { + assert(end_keyword->type == PM_TOKEN_KEYWORD_END || end_keyword->type == 0); + PM_NODE_LENGTH_SET_TOKEN(parser, node, end_keyword); + node->end_keyword_loc = TOK2LOC(parser, end_keyword); } /** @@ -2445,22 +2757,16 @@ pm_begin_node_end_keyword_set(pm_begin_node_t *node, const pm_token_t *end_keywo */ static pm_block_argument_node_t * pm_block_argument_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *expression) { - pm_block_argument_node_t *node = PM_NODE_ALLOC(parser, pm_block_argument_node_t); - - *node = (pm_block_argument_node_t) { - { - .type = PM_BLOCK_ARGUMENT_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = expression == NULL ? operator->end : expression->location.end - }, - }, - .expression = expression, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + assert(operator->type == PM_TOKEN_UAMPERSAND); + + return pm_block_argument_node_new( + parser->arena, + ++parser->node_id, + 0, + (expression == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKEN_NODE(parser, operator, expression), + expression, + TOK2LOC(parser, operator) + ); } /** @@ -2468,22 +2774,17 @@ pm_block_argument_node_create(pm_parser_t *parser, const pm_token_t *operator, p */ static pm_block_node_t * pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *opening, pm_node_t *parameters, pm_node_t *body, const pm_token_t *closing) { - pm_block_node_t *node = PM_NODE_ALLOC(parser, pm_block_node_t); - - *node = (pm_block_node_t) { - { - .type = PM_BLOCK_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = opening->start, .end = closing->end }, - }, - .locals = *locals, - .parameters = parameters, - .body = body, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) - }; - - return node; + return pm_block_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + *locals, + parameters, + body, + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -2491,24 +2792,17 @@ pm_block_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const p */ static pm_block_parameter_node_t * pm_block_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, const pm_token_t *operator) { - assert(operator->type == PM_TOKEN_NOT_PROVIDED || operator->type == PM_TOKEN_UAMPERSAND || operator->type == PM_TOKEN_AMPERSAND); - pm_block_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_block_parameter_node_t); - - *node = (pm_block_parameter_node_t) { - { - .type = PM_BLOCK_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = (name->type == PM_TOKEN_NOT_PROVIDED ? operator->end : name->end) - }, - }, - .name = pm_parser_optional_constant_id_token(parser, name), - .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + assert(operator->type == PM_TOKEN_UAMPERSAND || operator->type == PM_TOKEN_AMPERSAND); + + return pm_block_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + (name == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKENS(parser, operator, name), + name == NULL ? 0 : pm_parser_constant_id_token(parser, name), + NTOK2LOC(parser, name), + TOK2LOC(parser, operator) + ); } /** @@ -2516,53 +2810,44 @@ pm_block_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, cons */ static pm_block_parameters_node_t * pm_block_parameters_node_create(pm_parser_t *parser, pm_parameters_node_t *parameters, const pm_token_t *opening) { - pm_block_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_block_parameters_node_t); - - const uint8_t *start; - if (opening->type != PM_TOKEN_NOT_PROVIDED) { - start = opening->start; + uint32_t start; + if (opening != NULL) { + start = PM_TOKEN_START(parser, opening); } else if (parameters != NULL) { - start = parameters->base.location.start; + start = PM_NODE_START(parameters); } else { - start = NULL; + start = 0; } - const uint8_t *end; + uint32_t end; if (parameters != NULL) { - end = parameters->base.location.end; - } else if (opening->type != PM_TOKEN_NOT_PROVIDED) { - end = opening->end; + end = PM_NODE_END(parameters); + } else if (opening != NULL) { + end = PM_TOKEN_END(parser, opening); } else { - end = NULL; - } - - *node = (pm_block_parameters_node_t) { - { - .type = PM_BLOCK_PARAMETERS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = start, - .end = end - } - }, - .parameters = parameters, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .locals = { 0 } - }; - - return node; + end = 0; + } + + return pm_block_parameters_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + parameters, + ((pm_node_list_t) { 0 }), + NTOK2LOC(parser, opening), + ((pm_location_t) { 0 }) + ); } /** * Set the closing location of a BlockParametersNode node. */ static void -pm_block_parameters_node_closing_set(pm_block_parameters_node_t *node, const pm_token_t *closing) { - assert(closing->type == PM_TOKEN_PIPE || closing->type == PM_TOKEN_PARENTHESIS_RIGHT || closing->type == PM_TOKEN_MISSING); - - node->base.location.end = closing->end; - node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); +pm_block_parameters_node_closing_set(const pm_parser_t *parser, pm_block_parameters_node_t *node, const pm_token_t *closing) { + assert(closing->type == PM_TOKEN_PIPE || closing->type == PM_TOKEN_PARENTHESIS_RIGHT || closing->type == 0); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); + node->closing_loc = TOK2LOC(parser, closing); } /** @@ -2570,29 +2855,27 @@ pm_block_parameters_node_closing_set(pm_block_parameters_node_t *node, const pm_ */ static pm_block_local_variable_node_t * pm_block_local_variable_node_create(pm_parser_t *parser, const pm_token_t *name) { - pm_block_local_variable_node_t *node = PM_NODE_ALLOC(parser, pm_block_local_variable_node_t); - - *node = (pm_block_local_variable_node_t) { - { - .type = PM_BLOCK_LOCAL_VARIABLE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name), - }, - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + return pm_block_local_variable_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_parser_constant_id_token(parser, name) + ); } /** * Append a new block-local variable to a BlockParametersNode node. */ static void -pm_block_parameters_node_append_local(pm_block_parameters_node_t *node, const pm_block_local_variable_node_t *local) { - pm_node_list_append(&node->locals, (pm_node_t *) local); +pm_block_parameters_node_append_local(pm_arena_t *arena, pm_block_parameters_node_t *node, const pm_block_local_variable_node_t *local) { + pm_node_list_append(arena, &node->locals, UP(local)); + + if (PM_NODE_LENGTH(node) == 0) { + PM_NODE_START_SET_NODE(node, local); + } - if (node->base.location.start == NULL) node->base.location.start = local->base.location.start; - node->base.location.end = local->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, local); } /** @@ -2601,66 +2884,55 @@ pm_block_parameters_node_append_local(pm_block_parameters_node_t *node, const pm static pm_break_node_t * pm_break_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { assert(keyword->type == PM_TOKEN_KEYWORD_BREAK); - pm_break_node_t *node = PM_NODE_ALLOC(parser, pm_break_node_t); - *node = (pm_break_node_t) { - { - .type = PM_BREAK_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = (arguments == NULL ? keyword->end : arguments->base.location.end) - }, - }, - .arguments = arguments, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) - }; - - return node; + return pm_break_node_new( + parser->arena, + ++parser->node_id, + 0, + (arguments == NULL) ? PM_LOCATION_INIT_TOKEN(parser, keyword) : PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, arguments), + arguments, + TOK2LOC(parser, keyword) + ); } // There are certain flags that we want to use internally but don't want to // expose because they are not relevant beyond parsing. Therefore we'll define // them here and not define them in config.yml/a header file. -static const pm_node_flags_t PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY = 0x4; -static const pm_node_flags_t PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY = 0x40; -static const pm_node_flags_t PM_CALL_NODE_FLAGS_COMPARISON = 0x80; -static const pm_node_flags_t PM_CALL_NODE_FLAGS_INDEX = 0x100; +static const pm_node_flags_t PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY = (1 << 2); + +static const pm_node_flags_t PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY = ((PM_CALL_NODE_FLAGS_LAST - 1) << 1); +static const pm_node_flags_t PM_CALL_NODE_FLAGS_COMPARISON = ((PM_CALL_NODE_FLAGS_LAST - 1) << 2); +static const pm_node_flags_t PM_CALL_NODE_FLAGS_INDEX = ((PM_CALL_NODE_FLAGS_LAST - 1) << 3); /** - * Allocate and initialize a new CallNode node. This sets everything to NULL or - * PM_TOKEN_NOT_PROVIDED as appropriate such that its values can be overridden - * in the various specializations of this function. + * Allocate and initialize a new CallNode node. This sets everything to NULL + * such that its values can be overridden in the various specializations of this + * function. */ static pm_call_node_t * pm_call_node_create(pm_parser_t *parser, pm_node_flags_t flags) { - pm_call_node_t *node = PM_NODE_ALLOC(parser, pm_call_node_t); - - *node = (pm_call_node_t) { - { - .type = PM_CALL_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser), - }, - .receiver = NULL, - .call_operator_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .message_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .arguments = NULL, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .block = NULL, - .name = 0 - }; - - return node; + return pm_call_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_UNSET, + NULL, + ((pm_location_t) { 0 }), + 0, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + NULL, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + NULL + ); } /** * Returns the value that the ignore visibility flag should be set to for the * given receiver. */ -static inline pm_node_flags_t +static PRISM_INLINE pm_node_flags_t pm_call_node_ignore_visibility_flag(const pm_node_t *receiver) { return PM_NODE_TYPE_P(receiver, PM_SELF_NODE) ? PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY : 0; } @@ -2680,12 +2952,15 @@ pm_call_node_aref_create(pm_parser_t *parser, pm_node_t *receiver, pm_arguments_ pm_call_node_t *node = pm_call_node_create(parser, flags); - node->base.location.start = receiver->location.start; - node->base.location.end = pm_arguments_end(arguments); + PM_NODE_START_SET_NODE(node, receiver); + + const pm_location_t *end = pm_arguments_end(arguments); + assert(end != NULL && "unreachable"); + PM_NODE_LENGTH_SET_LOCATION(node, end); node->receiver = receiver; node->message_loc.start = arguments->opening_loc.start; - node->message_loc.end = arguments->closing_loc.end; + node->message_loc.length = (arguments->closing_loc.start + arguments->closing_loc.length) - arguments->opening_loc.start; node->opening_loc = arguments->opening_loc; node->arguments = arguments->arguments; @@ -2706,20 +2981,22 @@ pm_call_node_binary_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver) | flags); - node->base.location.start = MIN(receiver->location.start, argument->location.start); - node->base.location.end = MAX(receiver->location.end, argument->location.end); + PM_NODE_START_SET_NODE(node, PM_NODE_START(receiver) < PM_NODE_START(argument) ? receiver : argument); + PM_NODE_LENGTH_SET_NODE(node, PM_NODE_END(receiver) > PM_NODE_END(argument) ? receiver : argument); node->receiver = receiver; - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + node->message_loc = TOK2LOC(parser, operator); pm_arguments_node_t *arguments = pm_arguments_node_create(parser); - pm_arguments_node_arguments_append(arguments, argument); + pm_arguments_node_arguments_append(parser->arena, arguments, argument); node->arguments = arguments; node->name = pm_parser_constant_id_token(parser, operator); return node; } +static const uint8_t * parse_operator_symbol_name(const pm_token_t *); + /** * Allocate and initialize a new CallNode node from a call expression. */ @@ -2729,26 +3006,31 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); - node->base.location.start = receiver->location.start; - const uint8_t *end = pm_arguments_end(arguments); + PM_NODE_START_SET_NODE(node, receiver); + const pm_location_t *end = pm_arguments_end(arguments); if (end == NULL) { - end = message->end; + PM_NODE_LENGTH_SET_TOKEN(parser, node, message); + } else { + PM_NODE_LENGTH_SET_LOCATION(node, end); } - node->base.location.end = end; node->receiver = receiver; - node->call_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->call_operator_loc = TOK2LOC(parser, operator); + node->message_loc = TOK2LOC(parser, message); node->opening_loc = arguments->opening_loc; node->arguments = arguments->arguments; node->closing_loc = arguments->closing_loc; node->block = arguments->block; if (operator->type == PM_TOKEN_AMPERSAND_DOT) { - pm_node_flag_set((pm_node_t *)node, PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); + pm_node_flag_set(UP(node), PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); } - node->name = pm_parser_constant_id_token(parser, message); + /** + * If the final character is `@` as is the case for `foo.~@`, + * we should ignore the @ in the same way we do for symbols. + */ + node->name = pm_parser_constant_id_raw(parser, message->start, parse_operator_symbol_name(message)); return node; } @@ -2758,12 +3040,9 @@ pm_call_node_call_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *o static pm_call_node_t * pm_call_node_call_synthesized_create(pm_parser_t *parser, pm_node_t *receiver, const char *message, pm_arguments_node_t *arguments) { pm_call_node_t *node = pm_call_node_create(parser, 0); - node->base.location.start = parser->start; - node->base.location.end = parser->end; + node->base.location = (pm_location_t) { .start = 0, .length = U32(parser->end - parser->start) }; node->receiver = receiver; - node->call_operator_loc = (pm_location_t) { .start = NULL, .end = NULL }; - node->message_loc = (pm_location_t) { .start = NULL, .end = NULL }; node->arguments = arguments; node->name = pm_parser_constant_id_constant(parser, message, strlen(message)); @@ -2778,10 +3057,12 @@ static pm_call_node_t * pm_call_node_fcall_create(pm_parser_t *parser, pm_token_t *message, pm_arguments_t *arguments) { pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); - node->base.location.start = message->start; - node->base.location.end = pm_arguments_end(arguments); + PM_NODE_START_SET_TOKEN(parser, node, message); + const pm_location_t *end = pm_arguments_end(arguments); + assert(end != NULL && "unreachable"); + PM_NODE_LENGTH_SET_LOCATION(node, end); - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->message_loc = TOK2LOC(parser, message); node->opening_loc = arguments->opening_loc; node->arguments = arguments->arguments; node->closing_loc = arguments->closing_loc; @@ -2799,7 +3080,7 @@ static pm_call_node_t * pm_call_node_fcall_synthesized_create(pm_parser_t *parser, pm_arguments_node_t *arguments, pm_constant_id_t name) { pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); - node->base.location = PM_LOCATION_NULL_VALUE(parser); + node->base.location = (pm_location_t) { 0 }; node->arguments = arguments; node->name = name; @@ -2816,16 +3097,16 @@ pm_call_node_not_create(pm_parser_t *parser, pm_node_t *receiver, pm_token_t *me pm_call_node_t *node = pm_call_node_create(parser, receiver == NULL ? 0 : pm_call_node_ignore_visibility_flag(receiver)); - node->base.location.start = message->start; - if (arguments->closing_loc.start != NULL) { - node->base.location.end = arguments->closing_loc.end; + PM_NODE_START_SET_TOKEN(parser, node, message); + if (arguments->closing_loc.length > 0) { + PM_NODE_LENGTH_SET_LOCATION(node, &arguments->closing_loc); } else { assert(receiver != NULL); - node->base.location.end = receiver->location.end; + PM_NODE_LENGTH_SET_NODE(node, receiver); } node->receiver = receiver; - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->message_loc = TOK2LOC(parser, message); node->opening_loc = arguments->opening_loc; node->arguments = arguments->arguments; node->closing_loc = arguments->closing_loc; @@ -2843,18 +3124,20 @@ pm_call_node_shorthand_create(pm_parser_t *parser, pm_node_t *receiver, pm_token pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); - node->base.location.start = receiver->location.start; - node->base.location.end = pm_arguments_end(arguments); + PM_NODE_START_SET_NODE(node, receiver); + const pm_location_t *end = pm_arguments_end(arguments); + assert(end != NULL && "unreachable"); + PM_NODE_LENGTH_SET_LOCATION(node, end); node->receiver = receiver; - node->call_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + node->call_operator_loc = TOK2LOC(parser, operator); node->opening_loc = arguments->opening_loc; node->arguments = arguments->arguments; node->closing_loc = arguments->closing_loc; node->block = arguments->block; if (operator->type == PM_TOKEN_AMPERSAND_DOT) { - pm_node_flag_set((pm_node_t *)node, PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); + pm_node_flag_set(UP(node), PM_CALL_NODE_FLAGS_SAFE_NAVIGATION); } node->name = pm_parser_constant_id_constant(parser, "call", 4); @@ -2870,11 +3153,11 @@ pm_call_node_unary_create(pm_parser_t *parser, pm_token_t *operator, pm_node_t * pm_call_node_t *node = pm_call_node_create(parser, pm_call_node_ignore_visibility_flag(receiver)); - node->base.location.start = operator->start; - node->base.location.end = receiver->location.end; + PM_NODE_START_SET_TOKEN(parser, node, operator); + PM_NODE_LENGTH_SET_NODE(node, receiver); node->receiver = receiver; - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); + node->message_loc = TOK2LOC(parser, operator); node->name = pm_parser_constant_id_constant(parser, name, strlen(name)); return node; @@ -2888,8 +3171,8 @@ static pm_call_node_t * pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) { pm_call_node_t *node = pm_call_node_create(parser, PM_CALL_NODE_FLAGS_IGNORE_VISIBILITY); - node->base.location = PM_LOCATION_TOKEN_VALUE(message); - node->message_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(message); + node->base.location = TOK2LOC(parser, message); + node->message_loc = TOK2LOC(parser, message); node->name = pm_parser_constant_id_token(parser, message); return node; @@ -2899,14 +3182,14 @@ pm_call_node_variable_call_create(pm_parser_t *parser, pm_token_t *message) { * Returns whether or not this call can be used on the left-hand side of an * operator assignment. */ -static inline bool +static PRISM_INLINE bool pm_call_node_writable_p(const pm_parser_t *parser, const pm_call_node_t *node) { return ( - (node->message_loc.start != NULL) && - (node->message_loc.end[-1] != '!') && - (node->message_loc.end[-1] != '?') && - char_is_identifier_start(parser, node->message_loc.start, parser->end - node->message_loc.start) && - (node->opening_loc.start == NULL) && + (node->message_loc.length > 0) && + (parser->start[node->message_loc.start + node->message_loc.length - 1] != '!') && + (parser->start[node->message_loc.start + node->message_loc.length - 1] != '?') && + char_is_identifier_start(parser, parser->start + node->message_loc.start, (ptrdiff_t) node->message_loc.length) && + (node->opening_loc.length == 0) && (node->arguments == NULL) && (node->block == NULL) ); @@ -2922,10 +3205,10 @@ pm_call_write_read_name_init(pm_parser_t *parser, pm_constant_id_t *read_name, p if (write_constant->length > 0) { size_t length = write_constant->length - 1; - void *memory = xmalloc(length); + uint8_t *memory = (uint8_t *) pm_arena_alloc(parser->arena, length, 1); memcpy(memory, write_constant->start, length); - *read_name = pm_constant_pool_insert_owned(&parser->constant_pool, (uint8_t *) memory, length); + *read_name = pm_constant_pool_insert_owned(&parser->metadata_arena, &parser->constant_pool, memory, length); } else { // We can get here if the message was missing because of a syntax error. *read_name = pm_parser_constant_id_constant(parser, "", 0); @@ -2939,33 +3222,25 @@ static pm_call_and_write_node_t * pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(target->block == NULL); assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_call_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_and_write_node_t); - *node = (pm_call_and_write_node_t) { - { - .type = PM_CALL_AND_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .message_loc = target->message_loc, - .read_name = 0, - .write_name = target->name, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; + pm_call_and_write_node_t *node = pm_call_and_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->message_loc, + 0, + target->name, + TOK2LOC(parser, operator), + value + ); pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -2976,7 +3251,7 @@ pm_call_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const */ static void pm_index_arguments_check(pm_parser_t *parser, const pm_arguments_node_t *arguments, const pm_node_t *block) { - if (parser->version != PM_OPTIONS_VERSION_CRUBY_3_3) { + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) { if (arguments != NULL && PM_NODE_FLAG_P(arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS)) { pm_node_t *node; PM_NODE_LIST_FOREACH(&arguments->arguments, index, node) { @@ -2999,35 +3274,28 @@ pm_index_arguments_check(pm_parser_t *parser, const pm_arguments_node_t *argumen static pm_index_and_write_node_t * pm_index_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_index_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_and_write_node_t); pm_index_arguments_check(parser, target->arguments, target->block); assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); - *node = (pm_index_and_write_node_t) { - { - .type = PM_INDEX_AND_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .opening_loc = target->opening_loc, - .arguments = target->arguments, - .closing_loc = target->closing_loc, - .block = (pm_block_argument_node_t *) target->block, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + pm_index_and_write_node_t *node = pm_index_and_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->opening_loc, + target->arguments, + target->closing_loc, + (pm_block_argument_node_t *) target->block, + TOK2LOC(parser, operator), + value + ); + + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3038,34 +3306,26 @@ pm_index_and_write_node_create(pm_parser_t *parser, pm_call_node_t *target, cons static pm_call_operator_write_node_t * pm_call_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(target->block == NULL); - pm_call_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_operator_write_node_t); - *node = (pm_call_operator_write_node_t) { - { - .type = PM_CALL_OPERATOR_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .message_loc = target->message_loc, - .read_name = 0, - .write_name = target->name, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; + pm_call_operator_write_node_t *node = pm_call_operator_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->message_loc, + 0, + target->name, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1), + TOK2LOC(parser, operator), + value + ); pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3075,36 +3335,28 @@ pm_call_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target, */ static pm_index_operator_write_node_t * pm_index_operator_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_index_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_operator_write_node_t); - pm_index_arguments_check(parser, target->arguments, target->block); assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); - *node = (pm_index_operator_write_node_t) { - { - .type = PM_INDEX_OPERATOR_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .opening_loc = target->opening_loc, - .arguments = target->arguments, - .closing_loc = target->closing_loc, - .block = (pm_block_argument_node_t *) target->block, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + pm_index_operator_write_node_t *node = pm_index_operator_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->opening_loc, + target->arguments, + target->closing_loc, + (pm_block_argument_node_t *) target->block, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1), + TOK2LOC(parser, operator), + value + ); + + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3116,33 +3368,25 @@ static pm_call_or_write_node_t * pm_call_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(target->block == NULL); assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_call_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_call_or_write_node_t); - *node = (pm_call_or_write_node_t) { - { - .type = PM_CALL_OR_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .message_loc = target->message_loc, - .read_name = 0, - .write_name = target->name, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; + pm_call_or_write_node_t *node = pm_call_or_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->message_loc, + 0, + target->name, + TOK2LOC(parser, operator), + value + ); pm_call_write_read_name_init(parser, &node->read_name, &node->write_name); - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3153,35 +3397,28 @@ pm_call_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const static pm_index_or_write_node_t * pm_index_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_index_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_index_or_write_node_t); pm_index_arguments_check(parser, target->arguments, target->block); assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); - *node = (pm_index_or_write_node_t) { - { - .type = PM_INDEX_OR_WRITE_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .opening_loc = target->opening_loc, - .arguments = target->arguments, - .closing_loc = target->closing_loc, - .block = (pm_block_argument_node_t *) target->block, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + pm_index_or_write_node_t *node = pm_index_or_write_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODES(target, value), + target->receiver, + target->call_operator_loc, + target->opening_loc, + target->arguments, + target->closing_loc, + (pm_block_argument_node_t *) target->block, + TOK2LOC(parser, operator), + value + ); + + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3192,25 +3429,27 @@ pm_index_or_write_node_create(pm_parser_t *parser, pm_call_node_t *target, const */ static pm_call_target_node_t * pm_call_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { - pm_call_target_node_t *node = PM_NODE_ALLOC(parser, pm_call_target_node_t); + pm_call_target_node_t *node = pm_call_target_node_new( + parser->arena, + ++parser->node_id, + FL(target), + PM_LOCATION_INIT_NODE(target), + target->receiver, + target->call_operator_loc, + target->name, + target->message_loc + ); - *node = (pm_call_target_node_t) { - { - .type = PM_CALL_TARGET_NODE, - .flags = target->base.flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = target->base.location - }, - .receiver = target->receiver, - .call_operator_loc = target->call_operator_loc, - .name = target->name, - .message_loc = target->message_loc - }; + /* It is possible to get here where we have parsed an invalid syntax tree + * where the call operator was not present. In that case we will have a + * problem because it is a required location. In this case we need to fill + * it in with a fake location so that the syntax tree remains valid. */ + if (node->call_operator_loc.length == 0) { + node->call_operator_loc = target->base.location; + } - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3221,30 +3460,23 @@ pm_call_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { */ static pm_index_target_node_t * pm_index_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { - pm_index_target_node_t *node = PM_NODE_ALLOC(parser, pm_index_target_node_t); - pm_node_flags_t flags = target->base.flags; - pm_index_arguments_check(parser, target->arguments, target->block); - assert(!target->block || PM_NODE_TYPE_P(target->block, PM_BLOCK_ARGUMENT_NODE)); - *node = (pm_index_target_node_t) { - { - .type = PM_INDEX_TARGET_NODE, - .flags = flags | PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = target->base.location - }, - .receiver = target->receiver, - .opening_loc = target->opening_loc, - .arguments = target->arguments, - .closing_loc = target->closing_loc, - .block = (pm_block_argument_node_t *) target->block, - }; - // Here we're going to free the target, since it is no longer necessary. - // However, we don't want to call `pm_node_destroy` because we want to keep - // around all of its children since we just reused them. - xfree(target); + pm_index_target_node_t *node = pm_index_target_node_new( + parser->arena, + ++parser->node_id, + FL(target) | PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE, + PM_LOCATION_INIT_NODE(target), + target->receiver, + target->opening_loc, + target->arguments, + target->closing_loc, + (pm_block_argument_node_t *) target->block + ); + + // The target is no longer necessary because we've reused its children. + // It is arena-allocated so no explicit free is needed. return node; } @@ -3254,23 +3486,15 @@ pm_index_target_node_create(pm_parser_t *parser, pm_call_node_t *target) { */ static pm_capture_pattern_node_t * pm_capture_pattern_node_create(pm_parser_t *parser, pm_node_t *value, pm_local_variable_target_node_t *target, const pm_token_t *operator) { - pm_capture_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_capture_pattern_node_t); - - *node = (pm_capture_pattern_node_t) { - { - .type = PM_CAPTURE_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = value->location.start, - .end = target->base.location.end - }, - }, - .value = value, - .target = target, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_capture_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(value, target), + value, + target, + TOK2LOC(parser, operator) + ); } /** @@ -3278,36 +3502,28 @@ pm_capture_pattern_node_create(pm_parser_t *parser, pm_node_t *value, pm_local_v */ static pm_case_node_t * pm_case_node_create(pm_parser_t *parser, const pm_token_t *case_keyword, pm_node_t *predicate, const pm_token_t *end_keyword) { - pm_case_node_t *node = PM_NODE_ALLOC(parser, pm_case_node_t); - - *node = (pm_case_node_t) { - { - .type = PM_CASE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = case_keyword->start, - .end = end_keyword->end - }, - }, - .predicate = predicate, - .else_clause = NULL, - .case_keyword_loc = PM_LOCATION_TOKEN_VALUE(case_keyword), - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), - .conditions = { 0 } - }; - - return node; + return pm_case_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, case_keyword, end_keyword == NULL ? case_keyword : end_keyword), + predicate, + ((pm_node_list_t) { 0 }), + NULL, + TOK2LOC(parser, case_keyword), + NTOK2LOC(parser, end_keyword) + ); } /** * Append a new condition to a CaseNode node. */ static void -pm_case_node_condition_append(pm_case_node_t *node, pm_node_t *condition) { +pm_case_node_condition_append(pm_arena_t *arena, pm_case_node_t *node, pm_node_t *condition) { assert(PM_NODE_TYPE_P(condition, PM_WHEN_NODE)); - pm_node_list_append(&node->conditions, condition); - node->base.location.end = condition->location.end; + pm_node_list_append(arena, &node->conditions, condition); + PM_NODE_LENGTH_SET_NODE(node, condition); } /** @@ -3316,53 +3532,45 @@ pm_case_node_condition_append(pm_case_node_t *node, pm_node_t *condition) { static void pm_case_node_else_clause_set(pm_case_node_t *node, pm_else_node_t *else_clause) { node->else_clause = else_clause; - node->base.location.end = else_clause->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, else_clause); } /** * Set the end location for a CaseNode node. */ static void -pm_case_node_end_keyword_loc_set(pm_case_node_t *node, const pm_token_t *end_keyword) { - node->base.location.end = end_keyword->end; - node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); +pm_case_node_end_keyword_loc_set(const pm_parser_t *parser, pm_case_node_t *node, const pm_token_t *end_keyword) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, end_keyword); + node->end_keyword_loc = TOK2LOC(parser, end_keyword); } /** * Allocate and initialize a new CaseMatchNode node. */ static pm_case_match_node_t * -pm_case_match_node_create(pm_parser_t *parser, const pm_token_t *case_keyword, pm_node_t *predicate, const pm_token_t *end_keyword) { - pm_case_match_node_t *node = PM_NODE_ALLOC(parser, pm_case_match_node_t); - - *node = (pm_case_match_node_t) { - { - .type = PM_CASE_MATCH_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = case_keyword->start, - .end = end_keyword->end - }, - }, - .predicate = predicate, - .else_clause = NULL, - .case_keyword_loc = PM_LOCATION_TOKEN_VALUE(case_keyword), - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), - .conditions = { 0 } - }; - - return node; +pm_case_match_node_create(pm_parser_t *parser, const pm_token_t *case_keyword, pm_node_t *predicate) { + return pm_case_match_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, case_keyword), + predicate, + ((pm_node_list_t) { 0 }), + NULL, + TOK2LOC(parser, case_keyword), + ((pm_location_t) { 0 }) + ); } /** * Append a new condition to a CaseMatchNode node. */ static void -pm_case_match_node_condition_append(pm_case_match_node_t *node, pm_node_t *condition) { +pm_case_match_node_condition_append(pm_arena_t *arena, pm_case_match_node_t *node, pm_node_t *condition) { assert(PM_NODE_TYPE_P(condition, PM_IN_NODE)); - pm_node_list_append(&node->conditions, condition); - node->base.location.end = condition->location.end; + pm_node_list_append(arena, &node->conditions, condition); + PM_NODE_LENGTH_SET_NODE(node, condition); } /** @@ -3371,16 +3579,16 @@ pm_case_match_node_condition_append(pm_case_match_node_t *node, pm_node_t *condi static void pm_case_match_node_else_clause_set(pm_case_match_node_t *node, pm_else_node_t *else_clause) { node->else_clause = else_clause; - node->base.location.end = else_clause->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, else_clause); } /** * Set the end location for a CaseMatchNode node. */ static void -pm_case_match_node_end_keyword_loc_set(pm_case_match_node_t *node, const pm_token_t *end_keyword) { - node->base.location.end = end_keyword->end; - node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); +pm_case_match_node_end_keyword_loc_set(const pm_parser_t *parser, pm_case_match_node_t *node, const pm_token_t *end_keyword) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, end_keyword); + node->end_keyword_loc = TOK2LOC(parser, end_keyword); } /** @@ -3388,25 +3596,20 @@ pm_case_match_node_end_keyword_loc_set(pm_case_match_node_t *node, const pm_toke */ static pm_class_node_t * pm_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *class_keyword, pm_node_t *constant_path, const pm_token_t *name, const pm_token_t *inheritance_operator, pm_node_t *superclass, pm_node_t *body, const pm_token_t *end_keyword) { - pm_class_node_t *node = PM_NODE_ALLOC(parser, pm_class_node_t); - - *node = (pm_class_node_t) { - { - .type = PM_CLASS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = class_keyword->start, .end = end_keyword->end }, - }, - .locals = *locals, - .class_keyword_loc = PM_LOCATION_TOKEN_VALUE(class_keyword), - .constant_path = constant_path, - .inheritance_operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(inheritance_operator), - .superclass = superclass, - .body = body, - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + return pm_class_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, class_keyword, end_keyword), + *locals, + TOK2LOC(parser, class_keyword), + constant_path, + NTOK2LOC(parser, inheritance_operator), + superclass, + body, + TOK2LOC(parser, end_keyword), + pm_parser_constant_id_token(parser, name) + ); } /** @@ -3415,24 +3618,17 @@ pm_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const p static pm_class_variable_and_write_node_t * pm_class_variable_and_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_class_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_and_write_node_t); - *node = (pm_class_variable_and_write_node_t) { - { - .type = PM_CLASS_VARIABLE_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_class_variable_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -3440,25 +3636,17 @@ pm_class_variable_and_write_node_create(pm_parser_t *parser, pm_class_variable_r */ static pm_class_variable_operator_write_node_t * pm_class_variable_operator_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_class_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_operator_write_node_t); - - *node = (pm_class_variable_operator_write_node_t) { - { - .type = PM_CLASS_VARIABLE_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) - }; - - return node; + return pm_class_variable_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1) + ); } /** @@ -3467,24 +3655,17 @@ pm_class_variable_operator_write_node_create(pm_parser_t *parser, pm_class_varia static pm_class_variable_or_write_node_t * pm_class_variable_or_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_class_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_or_write_node_t); - - *node = (pm_class_variable_or_write_node_t) { - { - .type = PM_CLASS_VARIABLE_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - return node; + return pm_class_variable_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -3493,18 +3674,14 @@ pm_class_variable_or_write_node_create(pm_parser_t *parser, pm_class_variable_re static pm_class_variable_read_node_t * pm_class_variable_read_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_CLASS_VARIABLE); - pm_class_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_read_node_t); - - *node = (pm_class_variable_read_node_t) { - { - .type = PM_CLASS_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .name = pm_parser_constant_id_token(parser, token) - }; - return node; + return pm_class_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token), + pm_parser_constant_id_token(parser, token) + ); } /** @@ -3513,9 +3690,9 @@ pm_class_variable_read_node_create(pm_parser_t *parser, const pm_token_t *token) * a = *b * a = 1, 2, 3 */ -static inline pm_node_flags_t +static PRISM_INLINE pm_node_flags_t pm_implicit_array_write_flags(const pm_node_t *node, pm_node_flags_t flags) { - if (PM_NODE_TYPE_P(node, PM_ARRAY_NODE) && ((const pm_array_node_t *) node)->opening_loc.start == NULL) { + if (PM_NODE_TYPE_P(node, PM_ARRAY_NODE) && ((const pm_array_node_t *) node)->opening_loc.length == 0) { return flags; } return 0; @@ -3526,25 +3703,16 @@ pm_implicit_array_write_flags(const pm_node_t *node, pm_node_flags_t flags) { */ static pm_class_variable_write_node_t * pm_class_variable_write_node_create(pm_parser_t *parser, pm_class_variable_read_node_t *read_node, pm_token_t *operator, pm_node_t *value) { - pm_class_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_class_variable_write_node_t); - - *node = (pm_class_variable_write_node_t) { - { - .type = PM_CLASS_VARIABLE_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = read_node->base.location.start, - .end = value->location.end - }, - }, - .name = read_node->name, - .name_loc = PM_LOCATION_NODE_VALUE((pm_node_t *) read_node), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_class_variable_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(read_node, value), + read_node->name, + read_node->base.location, + value, + TOK2LOC(parser, operator) + ); } /** @@ -3553,23 +3721,16 @@ pm_class_variable_write_node_create(pm_parser_t *parser, pm_class_variable_read_ static pm_constant_path_and_write_node_t * pm_constant_path_and_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_constant_path_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_and_write_node_t); - - *node = (pm_constant_path_and_write_node_t) { - { - .type = PM_CONSTANT_PATH_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .target = target, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - return node; + return pm_constant_path_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target, + TOK2LOC(parser, operator), + value + ); } /** @@ -3577,24 +3738,16 @@ pm_constant_path_and_write_node_create(pm_parser_t *parser, pm_constant_path_nod */ static pm_constant_path_operator_write_node_t * pm_constant_path_operator_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_constant_path_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_operator_write_node_t); - - *node = (pm_constant_path_operator_write_node_t) { - { - .type = PM_CONSTANT_PATH_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .target = target, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) - }; - - return node; + return pm_constant_path_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target, + TOK2LOC(parser, operator), + value, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1) + ); } /** @@ -3603,23 +3756,16 @@ pm_constant_path_operator_write_node_create(pm_parser_t *parser, pm_constant_pat static pm_constant_path_or_write_node_t * pm_constant_path_or_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_constant_path_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_or_write_node_t); - *node = (pm_constant_path_or_write_node_t) { - { - .type = PM_CONSTANT_PATH_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .target = target, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_constant_path_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target, + TOK2LOC(parser, operator), + value + ); } /** @@ -3628,29 +3774,22 @@ pm_constant_path_or_write_node_create(pm_parser_t *parser, pm_constant_path_node static pm_constant_path_node_t * pm_constant_path_node_create(pm_parser_t *parser, pm_node_t *parent, const pm_token_t *delimiter, const pm_token_t *name_token) { pm_assert_value_expression(parser, parent); - pm_constant_path_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_node_t); pm_constant_id_t name = PM_CONSTANT_ID_UNSET; if (name_token->type == PM_TOKEN_CONSTANT) { name = pm_parser_constant_id_token(parser, name_token); } - *node = (pm_constant_path_node_t) { - { - .type = PM_CONSTANT_PATH_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = parent == NULL ? delimiter->start : parent->location.start, - .end = name_token->end - }, - }, - .parent = parent, - .name = name, - .delimiter_loc = PM_LOCATION_TOKEN_VALUE(delimiter), - .name_loc = PM_LOCATION_TOKEN_VALUE(name_token) - }; - - return node; + return pm_constant_path_node_new( + parser->arena, + ++parser->node_id, + 0, + (parent == NULL) ? PM_LOCATION_INIT_TOKENS(parser, delimiter, name_token) : PM_LOCATION_INIT_NODE_TOKEN(parser, parent, name_token), + parent, + name, + TOK2LOC(parser, delimiter), + TOK2LOC(parser, name_token) + ); } /** @@ -3658,24 +3797,15 @@ pm_constant_path_node_create(pm_parser_t *parser, pm_node_t *parent, const pm_to */ static pm_constant_path_write_node_t * pm_constant_path_write_node_create(pm_parser_t *parser, pm_constant_path_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_constant_path_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_path_write_node_t); - - *node = (pm_constant_path_write_node_t) { - { - .type = PM_CONSTANT_PATH_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - }, - }, - .target = target, - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_constant_path_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(target, value), + target, + TOK2LOC(parser, operator), + value + ); } /** @@ -3684,24 +3814,17 @@ pm_constant_path_write_node_create(pm_parser_t *parser, pm_constant_path_node_t static pm_constant_and_write_node_t * pm_constant_and_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_constant_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_and_write_node_t); - *node = (pm_constant_and_write_node_t) { - { - .type = PM_CONSTANT_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_constant_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -3709,25 +3832,17 @@ pm_constant_and_write_node_create(pm_parser_t *parser, pm_constant_read_node_t * */ static pm_constant_operator_write_node_t * pm_constant_operator_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_constant_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_operator_write_node_t); - - *node = (pm_constant_operator_write_node_t) { - { - .type = PM_CONSTANT_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) - }; - - return node; + return pm_constant_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1) + ); } /** @@ -3736,24 +3851,17 @@ pm_constant_operator_write_node_create(pm_parser_t *parser, pm_constant_read_nod static pm_constant_or_write_node_t * pm_constant_or_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_constant_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_or_write_node_t); - - *node = (pm_constant_or_write_node_t) { - { - .type = PM_CONSTANT_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - return node; + return pm_constant_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -3761,19 +3869,15 @@ pm_constant_or_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *t */ static pm_constant_read_node_t * pm_constant_read_node_create(pm_parser_t *parser, const pm_token_t *name) { - assert(name->type == PM_TOKEN_CONSTANT || name->type == PM_TOKEN_MISSING); - pm_constant_read_node_t *node = PM_NODE_ALLOC(parser, pm_constant_read_node_t); - - *node = (pm_constant_read_node_t) { - { - .type = PM_CONSTANT_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name) - }, - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + assert(name->type == PM_TOKEN_CONSTANT || name->type == 0); + + return pm_constant_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_parser_constant_id_token(parser, name) + ); } /** @@ -3781,25 +3885,16 @@ pm_constant_read_node_create(pm_parser_t *parser, const pm_token_t *name) { */ static pm_constant_write_node_t * pm_constant_write_node_create(pm_parser_t *parser, pm_constant_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_constant_write_node_t *node = PM_NODE_ALLOC(parser, pm_constant_write_node_t); - - *node = (pm_constant_write_node_t) { - { - .type = PM_CONSTANT_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_constant_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + value, + TOK2LOC(parser, operator) + ); } /** @@ -3810,7 +3905,7 @@ pm_def_node_receiver_check(pm_parser_t *parser, const pm_node_t *node) { switch (PM_NODE_TYPE(node)) { case PM_BEGIN_NODE: { const pm_begin_node_t *cast = (pm_begin_node_t *) node; - if (cast->statements != NULL) pm_def_node_receiver_check(parser, (pm_node_t *) cast->statements); + if (cast->statements != NULL) pm_def_node_receiver_check(parser, UP(cast->statements)); break; } case PM_PARENTHESES_NODE: { @@ -3865,65 +3960,45 @@ pm_def_node_create( const pm_token_t *equal, const pm_token_t *end_keyword ) { - pm_def_node_t *node = PM_NODE_ALLOC(parser, pm_def_node_t); - const uint8_t *end; - - if (end_keyword->type == PM_TOKEN_NOT_PROVIDED) { - end = body->location.end; - } else { - end = end_keyword->end; - } - if (receiver != NULL) { pm_def_node_receiver_check(parser, receiver); } - *node = (pm_def_node_t) { - { - .type = PM_DEF_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = def_keyword->start, .end = end }, - }, - .name = name, - .name_loc = PM_LOCATION_TOKEN_VALUE(name_loc), - .receiver = receiver, - .parameters = parameters, - .body = body, - .locals = *locals, - .def_keyword_loc = PM_LOCATION_TOKEN_VALUE(def_keyword), - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen), - .rparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(rparen), - .equal_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(equal), - .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + return pm_def_node_new( + parser->arena, + ++parser->node_id, + 0, + (end_keyword == NULL) ? PM_LOCATION_INIT_TOKEN_NODE(parser, def_keyword, body) : PM_LOCATION_INIT_TOKENS(parser, def_keyword, end_keyword), + name, + TOK2LOC(parser, name_loc), + receiver, + parameters, + body, + *locals, + TOK2LOC(parser, def_keyword), + NTOK2LOC(parser, operator), + NTOK2LOC(parser, lparen), + NTOK2LOC(parser, rparen), + NTOK2LOC(parser, equal), + NTOK2LOC(parser, end_keyword) + ); } /** * Allocate a new DefinedNode node. */ static pm_defined_node_t * -pm_defined_node_create(pm_parser_t *parser, const pm_token_t *lparen, pm_node_t *value, const pm_token_t *rparen, const pm_location_t *keyword_loc) { - pm_defined_node_t *node = PM_NODE_ALLOC(parser, pm_defined_node_t); - - *node = (pm_defined_node_t) { - { - .type = PM_DEFINED_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword_loc->start, - .end = (rparen->type == PM_TOKEN_NOT_PROVIDED ? value->location.end : rparen->end) - }, - }, - .lparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(lparen), - .value = value, - .rparen_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(rparen), - .keyword_loc = *keyword_loc - }; - - return node; +pm_defined_node_create(pm_parser_t *parser, const pm_token_t *lparen, pm_node_t *value, const pm_token_t *rparen, const pm_token_t *keyword) { + return pm_defined_node_new( + parser->arena, + ++parser->node_id, + 0, + (rparen == NULL) ? PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, value) : PM_LOCATION_INIT_TOKENS(parser, keyword, rparen), + NTOK2LOC(parser, lparen), + value, + NTOK2LOC(parser, rparen), + TOK2LOC(parser, keyword) + ); } /** @@ -3931,29 +4006,15 @@ pm_defined_node_create(pm_parser_t *parser, const pm_token_t *lparen, pm_node_t */ static pm_else_node_t * pm_else_node_create(pm_parser_t *parser, const pm_token_t *else_keyword, pm_statements_node_t *statements, const pm_token_t *end_keyword) { - pm_else_node_t *node = PM_NODE_ALLOC(parser, pm_else_node_t); - const uint8_t *end = NULL; - if ((end_keyword->type == PM_TOKEN_NOT_PROVIDED) && (statements != NULL)) { - end = statements->base.location.end; - } else { - end = end_keyword->end; - } - - *node = (pm_else_node_t) { - { - .type = PM_ELSE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = else_keyword->start, - .end = end, - }, - }, - .else_keyword_loc = PM_LOCATION_TOKEN_VALUE(else_keyword), - .statements = statements, - .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + return pm_else_node_new( + parser->arena, + ++parser->node_id, + 0, + ((end_keyword == NULL) && (statements != NULL)) ? PM_LOCATION_INIT_TOKEN_NODE(parser, else_keyword, statements) : PM_LOCATION_INIT_TOKENS(parser, else_keyword, end_keyword), + TOK2LOC(parser, else_keyword), + statements, + NTOK2LOC(parser, end_keyword) + ); } /** @@ -3961,23 +4022,15 @@ pm_else_node_create(pm_parser_t *parser, const pm_token_t *else_keyword, pm_stat */ static pm_embedded_statements_node_t * pm_embedded_statements_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { - pm_embedded_statements_node_t *node = PM_NODE_ALLOC(parser, pm_embedded_statements_node_t); - - *node = (pm_embedded_statements_node_t) { - { - .type = PM_EMBEDDED_STATEMENTS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - } - }, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .statements = statements, - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) - }; - - return node; + return pm_embedded_statements_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + TOK2LOC(parser, opening), + statements, + TOK2LOC(parser, closing) + ); } /** @@ -3985,22 +4038,14 @@ pm_embedded_statements_node_create(pm_parser_t *parser, const pm_token_t *openin */ static pm_embedded_variable_node_t * pm_embedded_variable_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *variable) { - pm_embedded_variable_node_t *node = PM_NODE_ALLOC(parser, pm_embedded_variable_node_t); - - *node = (pm_embedded_variable_node_t) { - { - .type = PM_EMBEDDED_VARIABLE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = variable->location.end - } - }, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .variable = variable - }; - - return node; + return pm_embedded_variable_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, operator, variable), + TOK2LOC(parser, operator), + variable + ); } /** @@ -4008,23 +4053,15 @@ pm_embedded_variable_node_create(pm_parser_t *parser, const pm_token_t *operator */ static pm_ensure_node_t * pm_ensure_node_create(pm_parser_t *parser, const pm_token_t *ensure_keyword, pm_statements_node_t *statements, const pm_token_t *end_keyword) { - pm_ensure_node_t *node = PM_NODE_ALLOC(parser, pm_ensure_node_t); - - *node = (pm_ensure_node_t) { - { - .type = PM_ENSURE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = ensure_keyword->start, - .end = end_keyword->end - }, - }, - .ensure_keyword_loc = PM_LOCATION_TOKEN_VALUE(ensure_keyword), - .statements = statements, - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + return pm_ensure_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, ensure_keyword, end_keyword), + TOK2LOC(parser, ensure_keyword), + statements, + TOK2LOC(parser, end_keyword) + ); } /** @@ -4033,16 +4070,13 @@ pm_ensure_node_create(pm_parser_t *parser, const pm_token_t *ensure_keyword, pm_ static pm_false_node_t * pm_false_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_FALSE); - pm_false_node_t *node = PM_NODE_ALLOC(parser, pm_false_node_t); - *node = (pm_false_node_t) {{ - .type = PM_FALSE_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - - return node; + return pm_false_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -4051,50 +4085,31 @@ pm_false_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_find_pattern_node_t * pm_find_pattern_node_create(pm_parser_t *parser, pm_node_list_t *nodes) { - pm_find_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_find_pattern_node_t); - + assert(nodes->size >= 2); pm_node_t *left = nodes->nodes[0]; - assert(PM_NODE_TYPE_P(left, PM_SPLAT_NODE)); - pm_splat_node_t *left_splat_node = (pm_splat_node_t *) left; - - pm_node_t *right; + pm_node_t *right = nodes->nodes[nodes->size - 1]; - if (nodes->size == 1) { - right = (pm_node_t *) pm_missing_node_create(parser, left->location.end, left->location.end); - } else { - right = nodes->nodes[nodes->size - 1]; - assert(PM_NODE_TYPE_P(right, PM_SPLAT_NODE)); - } - -#if PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS - // FindPatternNode#right is typed as SplatNode in this case, so replace the potential MissingNode with a SplatNode. - // The resulting AST will anyway be ignored, but this file still needs to compile. - pm_splat_node_t *right_splat_node = PM_NODE_TYPE_P(right, PM_SPLAT_NODE) ? (pm_splat_node_t *) right : left_splat_node; -#else - pm_node_t *right_splat_node = right; -#endif - *node = (pm_find_pattern_node_t) { - { - .type = PM_FIND_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = left->location.start, - .end = right->location.end, - }, - }, - .constant = NULL, - .left = left_splat_node, - .right = right_splat_node, - .requireds = { 0 }, - .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; + assert(PM_NODE_TYPE_P(left, PM_SPLAT_NODE)); + assert(PM_NODE_TYPE_P(right, PM_SPLAT_NODE)); + + pm_find_pattern_node_t *node = pm_find_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(left, right), + NULL, + (pm_splat_node_t *) left, + ((pm_node_list_t) { 0 }), + (pm_splat_node_t *) right, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); // For now we're going to just copy over each pointer manually. This could be // much more efficient, as we could instead resize the node list to only point // to 1...-1. for (size_t index = 1; index < nodes->size - 1; index++) { - pm_node_list_append(&node->requireds, nodes->nodes[index]); + pm_node_list_append(parser->arena, &node->requireds, nodes->nodes[index]); } return node; @@ -4111,7 +4126,8 @@ pm_double_parse(pm_parser_t *parser, const pm_token_t *token) { // First, get a buffer of the content. size_t length = (size_t) diff; - char *buffer = xmalloc(sizeof(char) * (length + 1)); + const size_t buffer_size = sizeof(char) * (length + 1); + char *buffer = xmalloc(buffer_size); memcpy((void *) buffer, token->start, length); // Next, determine if we need to replace the decimal point because of @@ -4145,8 +4161,8 @@ pm_double_parse(pm_parser_t *parser, const pm_token_t *token) { // This should never happen, because we've already checked that the token // is in a valid format. However it's good to be safe. if ((eptr != buffer + length) || (errno != 0 && errno != ERANGE)) { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, (*token), PM_ERR_FLOAT_PARSE); - xfree((void *) buffer); + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, token, PM_ERR_FLOAT_PARSE); + xfree_sized(buffer, buffer_size); return 0.0; } @@ -4164,12 +4180,12 @@ pm_double_parse(pm_parser_t *parser, const pm_token_t *token) { ellipsis = ""; } - pm_diagnostic_list_append_format(&parser->warning_list, token->start, token->end, PM_WARN_FLOAT_OUT_OF_RANGE, warn_width, (const char *) token->start, ellipsis); + pm_diagnostic_list_append_format(&parser->metadata_arena, &parser->warning_list, PM_TOKEN_START(parser, token), PM_TOKEN_LENGTH(token), PM_WARN_FLOAT_OUT_OF_RANGE, warn_width, (const char *) token->start, ellipsis); value = (value < 0.0) ? -HUGE_VAL : HUGE_VAL; } // Finally we can free the buffer and return the value. - xfree((void *) buffer); + xfree_sized(buffer, buffer_size); return value; } @@ -4179,19 +4195,14 @@ pm_double_parse(pm_parser_t *parser, const pm_token_t *token) { static pm_float_node_t * pm_float_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_FLOAT); - pm_float_node_t *node = PM_NODE_ALLOC(parser, pm_float_node_t); - *node = (pm_float_node_t) { - { - .type = PM_FLOAT_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .value = pm_double_parse(parser, token) - }; - - return node; + return pm_float_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + pm_double_parse(parser, token) + ); } /** @@ -4201,22 +4212,17 @@ static pm_imaginary_node_t * pm_float_node_imaginary_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_FLOAT_IMAGINARY); - pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); - *node = (pm_imaginary_node_t) { - { - .type = PM_IMAGINARY_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numeric = (pm_node_t *) pm_float_node_create(parser, &((pm_token_t) { + return pm_imaginary_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + UP(pm_float_node_create(parser, &((pm_token_t) { .type = PM_TOKEN_FLOAT, .start = token->start, .end = token->end - 1 - })) - }; - - return node; + }))) + ); } /** @@ -4226,17 +4232,14 @@ static pm_rational_node_t * pm_float_node_rational_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_FLOAT_RATIONAL); - pm_rational_node_t *node = PM_NODE_ALLOC(parser, pm_rational_node_t); - *node = (pm_rational_node_t) { - { - .type = PM_RATIONAL_NODE, - .flags = PM_INTEGER_BASE_FLAGS_DECIMAL | PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numerator = { 0 }, - .denominator = { 0 } - }; + pm_rational_node_t *node = pm_rational_node_new( + parser->arena, + ++parser->node_id, + PM_INTEGER_BASE_FLAGS_DECIMAL | PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + ((pm_integer_t) { 0 }), + ((pm_integer_t) { 0 }) + ); const uint8_t *start = token->start; const uint8_t *end = token->end - 1; // r @@ -4263,12 +4266,18 @@ pm_float_node_rational_create(pm_parser_t *parser, const pm_token_t *token) { memcpy(digits + (point - start), point + 1, (unsigned long) (end - point - 1)); pm_integer_parse(&node->numerator, PM_INTEGER_BASE_DEFAULT, digits, digits + length - 1); + size_t fract_length = 0; + for (const uint8_t *fract = point; fract < end; ++fract) { + if (*fract != '_') ++fract_length; + } digits[0] = '1'; - if (end - point > 1) memset(digits + 1, '0', (size_t) (end - point - 1)); - pm_integer_parse(&node->denominator, PM_INTEGER_BASE_DEFAULT, digits, digits + (end - point)); - xfree(digits); + if (fract_length > 1) memset(digits + 1, '0', fract_length - 1); + pm_integer_parse(&node->denominator, PM_INTEGER_BASE_DEFAULT, digits, digits + fract_length); + xfree_sized(digits, length); pm_integers_reduce(&node->numerator, &node->denominator); + pm_integer_arena_move(parser->arena, &node->numerator); + pm_integer_arena_move(parser->arena, &node->denominator); return node; } @@ -4280,22 +4289,17 @@ static pm_imaginary_node_t * pm_float_node_rational_imaginary_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_FLOAT_RATIONAL_IMAGINARY); - pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); - *node = (pm_imaginary_node_t) { - { - .type = PM_IMAGINARY_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numeric = (pm_node_t *) pm_float_node_rational_create(parser, &((pm_token_t) { + return pm_imaginary_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + UP(pm_float_node_rational_create(parser, &((pm_token_t) { .type = PM_TOKEN_FLOAT_RATIONAL, .start = token->start, .end = token->end - 1 - })) - }; - - return node; + }))) + ); } /** @@ -4312,27 +4316,19 @@ pm_for_node_create( const pm_token_t *do_keyword, const pm_token_t *end_keyword ) { - pm_for_node_t *node = PM_NODE_ALLOC(parser, pm_for_node_t); - - *node = (pm_for_node_t) { - { - .type = PM_FOR_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = for_keyword->start, - .end = end_keyword->end - }, - }, - .index = index, - .collection = collection, - .statements = statements, - .for_keyword_loc = PM_LOCATION_TOKEN_VALUE(for_keyword), - .in_keyword_loc = PM_LOCATION_TOKEN_VALUE(in_keyword), - .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + return pm_for_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, for_keyword, end_keyword), + index, + collection, + statements, + TOK2LOC(parser, for_keyword), + TOK2LOC(parser, in_keyword), + NTOK2LOC(parser, do_keyword), + TOK2LOC(parser, end_keyword) + ); } /** @@ -4341,15 +4337,13 @@ pm_for_node_create( static pm_forwarding_arguments_node_t * pm_forwarding_arguments_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_UDOT_DOT_DOT); - pm_forwarding_arguments_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_arguments_node_t); - - *node = (pm_forwarding_arguments_node_t) {{ - .type = PM_FORWARDING_ARGUMENTS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - return node; + return pm_forwarding_arguments_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -4358,15 +4352,13 @@ pm_forwarding_arguments_node_create(pm_parser_t *parser, const pm_token_t *token static pm_forwarding_parameter_node_t * pm_forwarding_parameter_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_UDOT_DOT_DOT); - pm_forwarding_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_parameter_node_t); - - *node = (pm_forwarding_parameter_node_t) {{ - .type = PM_FORWARDING_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - return node; + return pm_forwarding_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -4376,26 +4368,20 @@ static pm_forwarding_super_node_t * pm_forwarding_super_node_create(pm_parser_t *parser, const pm_token_t *token, pm_arguments_t *arguments) { assert(arguments->block == NULL || PM_NODE_TYPE_P(arguments->block, PM_BLOCK_NODE)); assert(token->type == PM_TOKEN_KEYWORD_SUPER); - pm_forwarding_super_node_t *node = PM_NODE_ALLOC(parser, pm_forwarding_super_node_t); pm_block_node_t *block = NULL; if (arguments->block != NULL) { block = (pm_block_node_t *) arguments->block; } - *node = (pm_forwarding_super_node_t) { - { - .type = PM_FORWARDING_SUPER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = token->start, - .end = block != NULL ? block->base.location.end : token->end - }, - }, - .block = block - }; - - return node; + return pm_forwarding_super_node_new( + parser->arena, + ++parser->node_id, + 0, + (block == NULL) ? PM_LOCATION_INIT_TOKEN(parser, token) : PM_LOCATION_INIT_TOKEN_NODE(parser, token, block), + PM_LOCATION_INIT_TOKEN(parser, token), + block + ); } /** @@ -4404,25 +4390,17 @@ pm_forwarding_super_node_create(pm_parser_t *parser, const pm_token_t *token, pm */ static pm_hash_pattern_node_t * pm_hash_pattern_node_empty_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { - pm_hash_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_hash_pattern_node_t); - - *node = (pm_hash_pattern_node_t) { - { - .type = PM_HASH_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - }, - }, - .constant = NULL, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .elements = { 0 }, - .rest = NULL - }; - - return node; + return pm_hash_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + NULL, + ((pm_node_list_t) { 0 }), + NULL, + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -4430,46 +4408,36 @@ pm_hash_pattern_node_empty_create(pm_parser_t *parser, const pm_token_t *opening */ static pm_hash_pattern_node_t * pm_hash_pattern_node_node_list_create(pm_parser_t *parser, pm_node_list_t *elements, pm_node_t *rest) { - pm_hash_pattern_node_t *node = PM_NODE_ALLOC(parser, pm_hash_pattern_node_t); - - const uint8_t *start; - const uint8_t *end; + uint32_t start; + uint32_t end; if (elements->size > 0) { if (rest) { - start = elements->nodes[0]->location.start; - end = rest->location.end; + start = MIN(PM_NODE_START(rest), PM_NODE_START(elements->nodes[0])); + end = MAX(PM_NODE_END(rest), PM_NODE_END(elements->nodes[elements->size - 1])); } else { - start = elements->nodes[0]->location.start; - end = elements->nodes[elements->size - 1]->location.end; + start = PM_NODE_START(elements->nodes[0]); + end = PM_NODE_END(elements->nodes[elements->size - 1]); } } else { assert(rest != NULL); - start = rest->location.start; - end = rest->location.end; - } - - *node = (pm_hash_pattern_node_t) { - { - .type = PM_HASH_PATTERN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = start, - .end = end - }, - }, - .constant = NULL, - .elements = { 0 }, - .rest = rest, - .opening_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - pm_node_t *element; - PM_NODE_LIST_FOREACH(elements, index, element) { - pm_node_list_append(&node->elements, element); - } + start = PM_NODE_START(rest); + end = PM_NODE_END(rest); + } + + pm_hash_pattern_node_t *node = pm_hash_pattern_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NULL, + ((pm_node_list_t) { 0 }), + rest, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); + pm_node_list_concat(parser->arena, &node->elements, elements); return node; } @@ -4486,7 +4454,7 @@ pm_global_variable_write_name(pm_parser_t *parser, const pm_node_t *target) { case PM_NUMBERED_REFERENCE_READ_NODE: // This will only ever happen in the event of a syntax error, but we // still need to provide something for the node. - return pm_parser_constant_id_location(parser, target->location.start, target->location.end); + return pm_parser_constant_id_raw(parser, parser->start + PM_NODE_START(target), parser->start + PM_NODE_END(target)); default: assert(false && "unreachable"); return (pm_constant_id_t) -1; @@ -4499,24 +4467,17 @@ pm_global_variable_write_name(pm_parser_t *parser, const pm_node_t *target) { static pm_global_variable_and_write_node_t * pm_global_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_global_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_and_write_node_t); - - *node = (pm_global_variable_and_write_node_t) { - { - .type = PM_GLOBAL_VARIABLE_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name = pm_global_variable_write_name(parser, target), - .name_loc = target->location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - return node; + return pm_global_variable_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + pm_global_variable_write_name(parser, target), + target->location, + TOK2LOC(parser, operator), + value + ); } /** @@ -4524,25 +4485,17 @@ pm_global_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, */ static pm_global_variable_operator_write_node_t * pm_global_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_global_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_operator_write_node_t); - - *node = (pm_global_variable_operator_write_node_t) { - { - .type = PM_GLOBAL_VARIABLE_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name = pm_global_variable_write_name(parser, target), - .name_loc = target->location, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) - }; - - return node; + return pm_global_variable_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + pm_global_variable_write_name(parser, target), + target->location, + TOK2LOC(parser, operator), + value, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1) + ); } /** @@ -4551,24 +4504,17 @@ pm_global_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *ta static pm_global_variable_or_write_node_t * pm_global_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_global_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_or_write_node_t); - *node = (pm_global_variable_or_write_node_t) { - { - .type = PM_GLOBAL_VARIABLE_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name = pm_global_variable_write_name(parser, target), - .name_loc = target->location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_global_variable_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + pm_global_variable_write_name(parser, target), + target->location, + TOK2LOC(parser, operator), + value + ); } /** @@ -4576,18 +4522,13 @@ pm_global_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, */ static pm_global_variable_read_node_t * pm_global_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name) { - pm_global_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_read_node_t); - - *node = (pm_global_variable_read_node_t) { - { - .type = PM_GLOBAL_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name), - }, - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + return pm_global_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_parser_constant_id_token(parser, name) + ); } /** @@ -4595,18 +4536,13 @@ pm_global_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name) */ static pm_global_variable_read_node_t * pm_global_variable_read_node_synthesized_create(pm_parser_t *parser, pm_constant_id_t name) { - pm_global_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_read_node_t); - - *node = (pm_global_variable_read_node_t) { - { - .type = PM_GLOBAL_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .name = name - }; - - return node; + return pm_global_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + name + ); } /** @@ -4614,25 +4550,16 @@ pm_global_variable_read_node_synthesized_create(pm_parser_t *parser, pm_constant */ static pm_global_variable_write_node_t * pm_global_variable_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_global_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_write_node_t); - - *node = (pm_global_variable_write_node_t) { - { - .type = PM_GLOBAL_VARIABLE_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .location = { - .start = target->location.start, - .end = value->location.end - }, - }, - .name = pm_global_variable_write_name(parser, target), - .name_loc = PM_LOCATION_NODE_VALUE(target), - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_global_variable_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(target, value), + pm_global_variable_write_name(parser, target), + target->location, + value, + TOK2LOC(parser, operator) + ); } /** @@ -4640,21 +4567,16 @@ pm_global_variable_write_node_create(pm_parser_t *parser, pm_node_t *target, con */ static pm_global_variable_write_node_t * pm_global_variable_write_node_synthesized_create(pm_parser_t *parser, pm_constant_id_t name, pm_node_t *value) { - pm_global_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_global_variable_write_node_t); - - *node = (pm_global_variable_write_node_t) { - { - .type = PM_GLOBAL_VARIABLE_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .name = name, - .name_loc = PM_LOCATION_NULL_VALUE(parser), - .operator_loc = PM_LOCATION_NULL_VALUE(parser), - .value = value - }; - - return node; + return pm_global_variable_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + name, + ((pm_location_t) { 0 }), + value, + ((pm_location_t) { 0 }) + ); } /** @@ -4663,29 +4585,24 @@ pm_global_variable_write_node_synthesized_create(pm_parser_t *parser, pm_constan static pm_hash_node_t * pm_hash_node_create(pm_parser_t *parser, const pm_token_t *opening) { assert(opening != NULL); - pm_hash_node_t *node = PM_NODE_ALLOC(parser, pm_hash_node_t); - - *node = (pm_hash_node_t) { - { - .type = PM_HASH_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(opening) - }, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_NULL_VALUE(parser), - .elements = { 0 } - }; - return node; + return pm_hash_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, opening), + TOK2LOC(parser, opening), + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }) + ); } /** * Append a new element to a hash node. */ -static inline void -pm_hash_node_elements_append(pm_hash_node_t *hash, pm_node_t *element) { - pm_node_list_append(&hash->elements, element); +static PRISM_INLINE void +pm_hash_node_elements_append(pm_arena_t *arena, pm_hash_node_t *hash, pm_node_t *element) { + pm_node_list_append(arena, &hash->elements, element); bool static_literal = PM_NODE_TYPE_P(element, PM_ASSOC_NODE); if (static_literal) { @@ -4696,14 +4613,14 @@ pm_hash_node_elements_append(pm_hash_node_t *hash, pm_node_t *element) { } if (!static_literal) { - pm_node_flag_unset((pm_node_t *)hash, PM_NODE_FLAG_STATIC_LITERAL); + pm_node_flag_unset(UP(hash), PM_NODE_FLAG_STATIC_LITERAL); } } -static inline void -pm_hash_node_closing_loc_set(pm_hash_node_t *hash, pm_token_t *token) { - hash->base.location.end = token->end; - hash->closing_loc = PM_LOCATION_TOKEN_VALUE(token); +static PRISM_INLINE void +pm_hash_node_closing_loc_set(const pm_parser_t *parser, pm_hash_node_t *hash, pm_token_t *token) { + PM_NODE_LENGTH_SET_TOKEN(parser, hash, token); + hash->closing_loc = TOK2LOC(parser, token); } /** @@ -4719,38 +4636,32 @@ pm_if_node_create(pm_parser_t *parser, const pm_token_t *end_keyword ) { pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); - const uint8_t *end; - if (end_keyword->type != PM_TOKEN_NOT_PROVIDED) { - end = end_keyword->end; + uint32_t start = PM_TOKEN_START(parser, if_keyword); + uint32_t end; + + if (end_keyword != NULL) { + end = PM_TOKEN_END(parser, end_keyword); } else if (subsequent != NULL) { - end = subsequent->location.end; + end = PM_NODE_END(subsequent); } else if (pm_statements_node_body_length(statements) != 0) { - end = statements->base.location.end; + end = PM_NODE_END(statements); } else { - end = predicate->location.end; - } - - *node = (pm_if_node_t) { - { - .type = PM_IF_NODE, - .flags = PM_NODE_FLAG_NEWLINE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = if_keyword->start, - .end = end - }, - }, - .if_keyword_loc = PM_LOCATION_TOKEN_VALUE(if_keyword), - .predicate = predicate, - .then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword), - .statements = statements, - .subsequent = subsequent, - .end_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + end = PM_NODE_END(predicate); + } + + return pm_if_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_NEWLINE, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + TOK2LOC(parser, if_keyword), + predicate, + NTOK2LOC(parser, then_keyword), + statements, + subsequent, + NTOK2LOC(parser, end_keyword) + ); } /** @@ -4759,30 +4670,22 @@ pm_if_node_create(pm_parser_t *parser, static pm_if_node_t * pm_if_node_modifier_create(pm_parser_t *parser, pm_node_t *statement, const pm_token_t *if_keyword, pm_node_t *predicate) { pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); pm_statements_node_t *statements = pm_statements_node_create(parser); pm_statements_node_body_append(parser, statements, statement, true); - *node = (pm_if_node_t) { - { - .type = PM_IF_NODE, - .flags = PM_NODE_FLAG_NEWLINE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = statement->location.start, - .end = predicate->location.end - }, - }, - .if_keyword_loc = PM_LOCATION_TOKEN_VALUE(if_keyword), - .predicate = predicate, - .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .statements = statements, - .subsequent = NULL, - .end_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + return pm_if_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_NEWLINE, + PM_LOCATION_INIT_NODES(statement, predicate), + TOK2LOC(parser, if_keyword), + predicate, + ((pm_location_t) { 0 }), + statements, + NULL, + ((pm_location_t) { 0 }) + ); } /** @@ -4799,43 +4702,31 @@ pm_if_node_ternary_create(pm_parser_t *parser, pm_node_t *predicate, const pm_to pm_statements_node_t *else_statements = pm_statements_node_create(parser); pm_statements_node_body_append(parser, else_statements, false_expression, true); - pm_token_t end_keyword = not_provided(parser); - pm_else_node_t *else_node = pm_else_node_create(parser, colon, else_statements, &end_keyword); - - pm_if_node_t *node = PM_NODE_ALLOC(parser, pm_if_node_t); - - *node = (pm_if_node_t) { - { - .type = PM_IF_NODE, - .flags = PM_NODE_FLAG_NEWLINE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = predicate->location.start, - .end = false_expression->location.end, - }, - }, - .if_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .predicate = predicate, - .then_keyword_loc = PM_LOCATION_TOKEN_VALUE(qmark), - .statements = if_statements, - .subsequent = (pm_node_t *) else_node, - .end_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; - + pm_else_node_t *else_node = pm_else_node_create(parser, colon, else_statements, NULL); + return pm_if_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_NEWLINE, + PM_LOCATION_INIT_NODES(predicate, false_expression), + ((pm_location_t) { 0 }), + predicate, + TOK2LOC(parser, qmark), + if_statements, + UP(else_node), + ((pm_location_t) { 0 }) + ); } -static inline void -pm_if_node_end_keyword_loc_set(pm_if_node_t *node, const pm_token_t *keyword) { - node->base.location.end = keyword->end; - node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword); +static PRISM_INLINE void +pm_if_node_end_keyword_loc_set(const pm_parser_t *parser, pm_if_node_t *node, const pm_token_t *keyword) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, keyword); + node->end_keyword_loc = TOK2LOC(parser, keyword); } -static inline void -pm_else_node_end_keyword_loc_set(pm_else_node_t *node, const pm_token_t *keyword) { - node->base.location.end = keyword->end; - node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword); +static PRISM_INLINE void +pm_else_node_end_keyword_loc_set(const pm_parser_t *parser, pm_else_node_t *node, const pm_token_t *keyword) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, keyword); + node->end_keyword_loc = TOK2LOC(parser, keyword); } /** @@ -4843,18 +4734,13 @@ pm_else_node_end_keyword_loc_set(pm_else_node_t *node, const pm_token_t *keyword */ static pm_implicit_node_t * pm_implicit_node_create(pm_parser_t *parser, pm_node_t *value) { - pm_implicit_node_t *node = PM_NODE_ALLOC(parser, pm_implicit_node_t); - - *node = (pm_implicit_node_t) { - { - .type = PM_IMPLICIT_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = value->location - }, - .value = value - }; - - return node; + return pm_implicit_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODE(value), + value + ); } /** @@ -4864,17 +4750,12 @@ static pm_implicit_rest_node_t * pm_implicit_rest_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_COMMA); - pm_implicit_rest_node_t *node = PM_NODE_ALLOC(parser, pm_implicit_rest_node_t); - - *node = (pm_implicit_rest_node_t) { - { - .type = PM_IMPLICIT_REST_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - } - }; - - return node; + return pm_implicit_rest_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -4883,28 +4764,33 @@ pm_implicit_rest_node_create(pm_parser_t *parser, const pm_token_t *token) { static pm_integer_node_t * pm_integer_node_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { assert(token->type == PM_TOKEN_INTEGER); - pm_integer_node_t *node = PM_NODE_ALLOC(parser, pm_integer_node_t); - *node = (pm_integer_node_t) { - { - .type = PM_INTEGER_NODE, - .flags = base | PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .value = { 0 } - }; + pm_integer_node_t *node = pm_integer_node_new( + parser->arena, + ++parser->node_id, + base | PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + ((pm_integer_t) { 0 }) + ); - pm_integer_base_t integer_base = PM_INTEGER_BASE_DECIMAL; - switch (base) { - case PM_INTEGER_BASE_FLAGS_BINARY: integer_base = PM_INTEGER_BASE_BINARY; break; - case PM_INTEGER_BASE_FLAGS_OCTAL: integer_base = PM_INTEGER_BASE_OCTAL; break; - case PM_INTEGER_BASE_FLAGS_DECIMAL: break; - case PM_INTEGER_BASE_FLAGS_HEXADECIMAL: integer_base = PM_INTEGER_BASE_HEXADECIMAL; break; - default: assert(false && "unreachable"); break; + if (parser->integer.lexed) { + // The value was already computed during lexing. + node->value.value = parser->integer.value; + parser->integer.lexed = false; + } else { + pm_integer_base_t integer_base = PM_INTEGER_BASE_DECIMAL; + switch (base) { + case PM_INTEGER_BASE_FLAGS_BINARY: integer_base = PM_INTEGER_BASE_BINARY; break; + case PM_INTEGER_BASE_FLAGS_OCTAL: integer_base = PM_INTEGER_BASE_OCTAL; break; + case PM_INTEGER_BASE_FLAGS_DECIMAL: break; + case PM_INTEGER_BASE_FLAGS_HEXADECIMAL: integer_base = PM_INTEGER_BASE_HEXADECIMAL; break; + default: assert(false && "unreachable"); break; + } + + pm_integer_parse(&node->value, integer_base, token->start, token->end); + pm_integer_arena_move(parser->arena, &node->value); } - pm_integer_parse(&node->value, integer_base, token->start, token->end); return node; } @@ -4916,22 +4802,17 @@ static pm_imaginary_node_t * pm_integer_node_imaginary_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { assert(token->type == PM_TOKEN_INTEGER_IMAGINARY); - pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); - *node = (pm_imaginary_node_t) { - { - .type = PM_IMAGINARY_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numeric = (pm_node_t *) pm_integer_node_create(parser, base, &((pm_token_t) { + return pm_imaginary_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + UP(pm_integer_node_create(parser, base, &((pm_token_t) { .type = PM_TOKEN_INTEGER, .start = token->start, .end = token->end - 1 - })) - }; - - return node; + }))) + ); } /** @@ -4942,17 +4823,14 @@ static pm_rational_node_t * pm_integer_node_rational_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { assert(token->type == PM_TOKEN_INTEGER_RATIONAL); - pm_rational_node_t *node = PM_NODE_ALLOC(parser, pm_rational_node_t); - *node = (pm_rational_node_t) { - { - .type = PM_RATIONAL_NODE, - .flags = base | PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numerator = { 0 }, - .denominator = { .value = 1, 0 } - }; + pm_rational_node_t *node = pm_rational_node_new( + parser->arena, + ++parser->node_id, + base | PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + ((pm_integer_t) { 0 }), + ((pm_integer_t) { .value = 1 }) + ); pm_integer_base_t integer_base = PM_INTEGER_BASE_DECIMAL; switch (base) { @@ -4964,6 +4842,7 @@ pm_integer_node_rational_create(pm_parser_t *parser, pm_node_flags_t base, const } pm_integer_parse(&node->numerator, integer_base, token->start, token->end - 1); + pm_integer_arena_move(parser->arena, &node->numerator); return node; } @@ -4976,22 +4855,17 @@ static pm_imaginary_node_t * pm_integer_node_rational_imaginary_create(pm_parser_t *parser, pm_node_flags_t base, const pm_token_t *token) { assert(token->type == PM_TOKEN_INTEGER_RATIONAL_IMAGINARY); - pm_imaginary_node_t *node = PM_NODE_ALLOC(parser, pm_imaginary_node_t); - *node = (pm_imaginary_node_t) { - { - .type = PM_IMAGINARY_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .numeric = (pm_node_t *) pm_integer_node_rational_create(parser, base, &((pm_token_t) { + return pm_imaginary_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token), + UP(pm_integer_node_rational_create(parser, base, &((pm_token_t) { .type = PM_TOKEN_INTEGER_RATIONAL, .start = token->start, .end = token->end - 1 - })) - }; - - return node; + }))) + ); } /** @@ -4999,33 +4873,27 @@ pm_integer_node_rational_imaginary_create(pm_parser_t *parser, pm_node_flags_t b */ static pm_in_node_t * pm_in_node_create(pm_parser_t *parser, pm_node_t *pattern, pm_statements_node_t *statements, const pm_token_t *in_keyword, const pm_token_t *then_keyword) { - pm_in_node_t *node = PM_NODE_ALLOC(parser, pm_in_node_t); + uint32_t start = PM_TOKEN_START(parser, in_keyword); + uint32_t end; - const uint8_t *end; if (statements != NULL) { - end = statements->base.location.end; - } else if (then_keyword->type != PM_TOKEN_NOT_PROVIDED) { - end = then_keyword->end; + end = PM_NODE_END(statements); + } else if (then_keyword != NULL) { + end = PM_TOKEN_END(parser, then_keyword); } else { - end = pattern->location.end; - } - - *node = (pm_in_node_t) { - { - .type = PM_IN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = in_keyword->start, - .end = end - }, - }, - .pattern = pattern, - .statements = statements, - .in_loc = PM_LOCATION_TOKEN_VALUE(in_keyword), - .then_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword) - }; - - return node; + end = PM_NODE_END(pattern); + } + + return pm_in_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + pattern, + statements, + TOK2LOC(parser, in_keyword), + NTOK2LOC(parser, then_keyword) + ); } /** @@ -5034,24 +4902,17 @@ pm_in_node_create(pm_parser_t *parser, pm_node_t *pattern, pm_statements_node_t static pm_instance_variable_and_write_node_t * pm_instance_variable_and_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_instance_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_and_write_node_t); - *node = (pm_instance_variable_and_write_node_t) { - { - .type = PM_INSTANCE_VARIABLE_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_instance_variable_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -5059,25 +4920,17 @@ pm_instance_variable_and_write_node_create(pm_parser_t *parser, pm_instance_vari */ static pm_instance_variable_operator_write_node_t * pm_instance_variable_operator_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_instance_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_operator_write_node_t); - - *node = (pm_instance_variable_operator_write_node_t) { - { - .type = PM_INSTANCE_VARIABLE_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1) - }; - - return node; + return pm_instance_variable_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1) + ); } /** @@ -5086,24 +4939,17 @@ pm_instance_variable_operator_write_node_create(pm_parser_t *parser, pm_instance static pm_instance_variable_or_write_node_t * pm_instance_variable_or_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *target, const pm_token_t *operator, pm_node_t *value) { assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_instance_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_or_write_node_t); - - *node = (pm_instance_variable_or_write_node_t) { - { - .type = PM_INSTANCE_VARIABLE_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .name = target->name, - .name_loc = target->base.location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - return node; + return pm_instance_variable_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->name, + target->base.location, + TOK2LOC(parser, operator), + value + ); } /** @@ -5112,18 +4958,14 @@ pm_instance_variable_or_write_node_create(pm_parser_t *parser, pm_instance_varia static pm_instance_variable_read_node_t * pm_instance_variable_read_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_INSTANCE_VARIABLE); - pm_instance_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_read_node_t); - - *node = (pm_instance_variable_read_node_t) { - { - .type = PM_INSTANCE_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .name = pm_parser_constant_id_token(parser, token) - }; - return node; + return pm_instance_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token), + pm_parser_constant_id_token(parser, token) + ); } /** @@ -5132,24 +4974,16 @@ pm_instance_variable_read_node_create(pm_parser_t *parser, const pm_token_t *tok */ static pm_instance_variable_write_node_t * pm_instance_variable_write_node_create(pm_parser_t *parser, pm_instance_variable_read_node_t *read_node, pm_token_t *operator, pm_node_t *value) { - pm_instance_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_instance_variable_write_node_t); - *node = (pm_instance_variable_write_node_t) { - { - .type = PM_INSTANCE_VARIABLE_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = read_node->base.location.start, - .end = value->location.end - } - }, - .name = read_node->name, - .name_loc = PM_LOCATION_NODE_BASE_VALUE(read_node), - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_instance_variable_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(read_node, value), + read_node->name, + read_node->base.location, + value, + TOK2LOC(parser, operator) + ); } /** @@ -5158,7 +4992,7 @@ pm_instance_variable_write_node_create(pm_parser_t *parser, pm_instance_variable * literals. */ static void -pm_interpolated_node_append(pm_node_t *node, pm_node_list_t *parts, pm_node_t *part) { +pm_interpolated_node_append(pm_arena_t *arena, pm_node_t *node, pm_node_list_t *parts, pm_node_t *part) { switch (PM_NODE_TYPE(part)) { case PM_STRING_NODE: pm_node_flag_set(part, PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN); @@ -5186,14 +5020,14 @@ pm_interpolated_node_append(pm_node_t *node, pm_node_list_t *parts, pm_node_t *p break; } case PM_EMBEDDED_VARIABLE_NODE: - pm_node_flag_unset((pm_node_t *) node, PM_NODE_FLAG_STATIC_LITERAL); + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); break; default: assert(false && "unexpected node type"); break; } - pm_node_list_append(parts, part); + pm_node_list_append(arena, parts, part); } /** @@ -5201,43 +5035,34 @@ pm_interpolated_node_append(pm_node_t *node, pm_node_list_t *parts, pm_node_t *p */ static pm_interpolated_regular_expression_node_t * pm_interpolated_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening) { - pm_interpolated_regular_expression_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_regular_expression_node_t); - - *node = (pm_interpolated_regular_expression_node_t) { - { - .type = PM_INTERPOLATED_REGULAR_EXPRESSION_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = NULL, - }, - }, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(opening), - .parts = { 0 } - }; - - return node; + return pm_interpolated_regular_expression_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, opening), + TOK2LOC(parser, opening), + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, opening) + ); } -static inline void -pm_interpolated_regular_expression_node_append(pm_interpolated_regular_expression_node_t *node, pm_node_t *part) { - if (node->base.location.start > part->location.start) { - node->base.location.start = part->location.start; +static PRISM_INLINE void +pm_interpolated_regular_expression_node_append(pm_arena_t *arena, pm_interpolated_regular_expression_node_t *node, pm_node_t *part) { + if (PM_NODE_START(node) > PM_NODE_START(part)) { + PM_NODE_START_SET_NODE(node, part); } - if (node->base.location.end < part->location.end) { - node->base.location.end = part->location.end; + if (PM_NODE_END(node) < PM_NODE_END(part)) { + PM_NODE_LENGTH_SET_NODE(node, part); } - pm_interpolated_node_append((pm_node_t *) node, &node->parts, part); + pm_interpolated_node_append(arena, UP(node), &node->parts, part); } -static inline void +static PRISM_INLINE void pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_interpolated_regular_expression_node_t *node, const pm_token_t *closing) { - node->closing_loc = PM_LOCATION_TOKEN_VALUE(closing); - node->base.location.end = closing->end; - pm_node_flag_set((pm_node_t *) node, pm_regular_expression_flags_create(parser, closing)); + node->closing_loc = TOK2LOC(parser, closing); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); + pm_node_flag_set(UP(node), pm_regular_expression_flags_create(parser, closing)); } /** @@ -5249,7 +5074,7 @@ pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_inte * PM_NODE_FLAG_STATIC_LITERAL indicates that the node should be treated as a * single static literal string that can be pushed onto the stack on its own. * Note that this doesn't necessarily mean that the string will be frozen or - * not; the instructions in CRuby will be either putobject or putstring, + * not; the instructions in CRuby will be either putobject, dupstring or dupchilledstring, * depending on the combination of `--enable-frozen-string-literal`, * `# frozen_string_literal: true`, and whether or not there is interpolation. * @@ -5263,22 +5088,31 @@ pm_interpolated_regular_expression_node_closing_set(pm_parser_t *parser, pm_inte * is necessary to indicate that the string should be left up to the runtime, * which could potentially use a chilled string otherwise. */ -static inline void -pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_t *part) { +static PRISM_INLINE void +pm_interpolated_string_node_append(pm_parser_t *parser, pm_interpolated_string_node_t *node, pm_node_t *part) { + pm_arena_t *arena = parser->arena; #define CLEAR_FLAGS(node) \ - node->base.flags = (pm_node_flags_t) (node->base.flags & ~(PM_NODE_FLAG_STATIC_LITERAL | PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE)) + node->base.flags = (pm_node_flags_t) (FL(node) & ~(PM_NODE_FLAG_STATIC_LITERAL | PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE)) #define MUTABLE_FLAGS(node) \ - node->base.flags = (pm_node_flags_t) ((node->base.flags | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE) & ~PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN); + node->base.flags = (pm_node_flags_t) ((FL(node) | PM_INTERPOLATED_STRING_NODE_FLAGS_MUTABLE) & ~PM_INTERPOLATED_STRING_NODE_FLAGS_FROZEN); - if (node->parts.size == 0 && node->opening_loc.start == NULL) { - node->base.location.start = part->location.start; + if (node->parts.size == 0 && node->opening_loc.length == 0) { + PM_NODE_START_SET_NODE(node, part); } - node->base.location.end = MAX(node->base.location.end, part->location.end); + if (PM_NODE_END(part) > PM_NODE_END(node)) { + PM_NODE_LENGTH_SET_NODE(node, part); + } switch (PM_NODE_TYPE(part)) { case PM_STRING_NODE: + // If inner string is not frozen, it stops being a static literal. We should *not* clear other flags, + // because concatenating two frozen strings (`'foo' 'bar'`) is still frozen. This holds true for + // as long as this interpolation only consists of other string literals. + if (!PM_NODE_FLAG_P(part, PM_STRING_FLAGS_FROZEN)) { + pm_node_flag_unset(UP(node), PM_NODE_FLAG_STATIC_LITERAL); + } part->flags = (pm_node_flags_t) ((part->flags | PM_NODE_FLAG_STATIC_LITERAL | PM_STRING_FLAGS_FROZEN) & ~PM_STRING_FLAGS_MUTABLE); break; case PM_INTERPOLATED_STRING_NODE: @@ -5330,8 +5164,14 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_ break; case PM_X_STRING_NODE: case PM_INTERPOLATED_X_STRING_NODE: - // If this is an x string, then this is a syntax error. But we want - // to handle it here so that we don't fail the assertion. + case PM_SYMBOL_NODE: + case PM_INTERPOLATED_SYMBOL_NODE: + // These will only happen in error cases. But we want to handle it + // here so that we don't fail the assertion. + CLEAR_FLAGS(node); + pm_node_list_append(arena, &node->parts, UP(pm_error_recovery_node_create_unexpected(parser, part))); + return; + case PM_ERROR_RECOVERY_NODE: CLEAR_FLAGS(node); break; default: @@ -5339,7 +5179,7 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_ break; } - pm_node_list_append(&node->parts, part); + pm_node_list_append(arena, &node->parts, part); #undef CLEAR_FLAGS #undef MUTABLE_FLAGS @@ -5350,7 +5190,6 @@ pm_interpolated_string_node_append(pm_interpolated_string_node_t *node, pm_node_ */ static pm_interpolated_string_node_t * pm_interpolated_string_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_node_list_t *parts, const pm_token_t *closing) { - pm_interpolated_string_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_string_node_t); pm_node_flags_t flags = PM_NODE_FLAG_STATIC_LITERAL; switch (parser->frozen_string_literal) { @@ -5362,25 +5201,23 @@ pm_interpolated_string_node_create(pm_parser_t *parser, const pm_token_t *openin break; } - *node = (pm_interpolated_string_node_t) { - { - .type = PM_INTERPOLATED_STRING_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end, - }, - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .parts = { 0 } - }; + uint32_t start = opening == NULL ? 0 : PM_TOKEN_START(parser, opening); + uint32_t end = closing == NULL ? 0 : PM_TOKEN_END(parser, closing); + + pm_interpolated_string_node_t *node = pm_interpolated_string_node_new( + parser->arena, + ++parser->node_id, + flags, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NTOK2LOC(parser, opening), + ((pm_node_list_t) { 0 }), + NTOK2LOC(parser, closing) + ); if (parts != NULL) { pm_node_t *part; PM_NODE_LIST_FOREACH(parts, index, part) { - pm_interpolated_string_node_append(node, part); + pm_interpolated_string_node_append(parser, node, part); } } @@ -5391,25 +5228,28 @@ pm_interpolated_string_node_create(pm_parser_t *parser, const pm_token_t *openin * Set the closing token of the given InterpolatedStringNode node. */ static void -pm_interpolated_string_node_closing_set(pm_interpolated_string_node_t *node, const pm_token_t *closing) { - node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); - node->base.location.end = closing->end; +pm_interpolated_string_node_closing_set(const pm_parser_t *parser, pm_interpolated_string_node_t *node, const pm_token_t *closing) { + node->closing_loc = TOK2LOC(parser, closing); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); } static void -pm_interpolated_symbol_node_append(pm_interpolated_symbol_node_t *node, pm_node_t *part) { - if (node->parts.size == 0 && node->opening_loc.start == NULL) { - node->base.location.start = part->location.start; +pm_interpolated_symbol_node_append(pm_arena_t *arena, pm_interpolated_symbol_node_t *node, pm_node_t *part) { + if (node->parts.size == 0 && node->opening_loc.length == 0) { + PM_NODE_START_SET_NODE(node, part); } - pm_interpolated_node_append((pm_node_t *) node, &node->parts, part); - node->base.location.end = MAX(node->base.location.end, part->location.end); + pm_interpolated_node_append(arena, UP(node), &node->parts, part); + + if (PM_NODE_END(part) > PM_NODE_END(node)) { + PM_NODE_LENGTH_SET_NODE(node, part); + } } static void -pm_interpolated_symbol_node_closing_loc_set(pm_interpolated_symbol_node_t *node, const pm_token_t *closing) { - node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); - node->base.location.end = closing->end; +pm_interpolated_symbol_node_closing_loc_set(const pm_parser_t *parser, pm_interpolated_symbol_node_t *node, const pm_token_t *closing) { + node->closing_loc = TOK2LOC(parser, closing); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); } /** @@ -5417,27 +5257,23 @@ pm_interpolated_symbol_node_closing_loc_set(pm_interpolated_symbol_node_t *node, */ static pm_interpolated_symbol_node_t * pm_interpolated_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_node_list_t *parts, const pm_token_t *closing) { - pm_interpolated_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_symbol_node_t); - - *node = (pm_interpolated_symbol_node_t) { - { - .type = PM_INTERPOLATED_SYMBOL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end, - }, - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .parts = { 0 } - }; + uint32_t start = opening == NULL ? 0 : PM_TOKEN_START(parser, opening); + uint32_t end = closing == NULL ? 0 : PM_TOKEN_END(parser, closing); + + pm_interpolated_symbol_node_t *node = pm_interpolated_symbol_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NTOK2LOC(parser, opening), + ((pm_node_list_t) { 0 }), + NTOK2LOC(parser, closing) + ); if (parts != NULL) { pm_node_t *part; PM_NODE_LIST_FOREACH(parts, index, part) { - pm_interpolated_symbol_node_append(node, part); + pm_interpolated_symbol_node_append(parser->arena, node, part); } } @@ -5449,35 +5285,27 @@ pm_interpolated_symbol_node_create(pm_parser_t *parser, const pm_token_t *openin */ static pm_interpolated_x_string_node_t * pm_interpolated_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { - pm_interpolated_x_string_node_t *node = PM_NODE_ALLOC(parser, pm_interpolated_x_string_node_t); - - *node = (pm_interpolated_x_string_node_t) { - { - .type = PM_INTERPOLATED_X_STRING_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - }, - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .parts = { 0 } - }; - - return node; + return pm_interpolated_x_string_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + TOK2LOC(parser, opening), + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, closing) + ); } -static inline void -pm_interpolated_xstring_node_append(pm_interpolated_x_string_node_t *node, pm_node_t *part) { - pm_interpolated_node_append((pm_node_t *) node, &node->parts, part); - node->base.location.end = part->location.end; +static PRISM_INLINE void +pm_interpolated_xstring_node_append(pm_arena_t *arena, pm_interpolated_x_string_node_t *node, pm_node_t *part) { + pm_interpolated_node_append(arena, UP(node), &node->parts, part); + PM_NODE_LENGTH_SET_NODE(node, part); } -static inline void -pm_interpolated_xstring_node_closing_set(pm_interpolated_x_string_node_t *node, const pm_token_t *closing) { - node->closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing); - node->base.location.end = closing->end; +static PRISM_INLINE void +pm_interpolated_xstring_node_closing_set(const pm_parser_t *parser, pm_interpolated_x_string_node_t *node, const pm_token_t *closing) { + node->closing_loc = TOK2LOC(parser, closing); + PM_NODE_LENGTH_SET_TOKEN(parser, node, closing); } /** @@ -5485,17 +5313,12 @@ pm_interpolated_xstring_node_closing_set(pm_interpolated_x_string_node_t *node, */ static pm_it_local_variable_read_node_t * pm_it_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *name) { - pm_it_local_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_it_local_variable_read_node_t); - - *node = (pm_it_local_variable_read_node_t) { - { - .type = PM_IT_LOCAL_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name) - } - }; - - return node; + return pm_it_local_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name) + ); } /** @@ -5503,20 +5326,12 @@ pm_it_local_variable_read_node_create(pm_parser_t *parser, const pm_token_t *nam */ static pm_it_parameters_node_t * pm_it_parameters_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing) { - pm_it_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_it_parameters_node_t); - - *node = (pm_it_parameters_node_t) { - { - .type = PM_IT_PARAMETERS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - } - } - }; - - return node; + return pm_it_parameters_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing) + ); } /** @@ -5524,37 +5339,31 @@ pm_it_parameters_node_create(pm_parser_t *parser, const pm_token_t *opening, con */ static pm_keyword_hash_node_t * pm_keyword_hash_node_create(pm_parser_t *parser) { - pm_keyword_hash_node_t *node = PM_NODE_ALLOC(parser, pm_keyword_hash_node_t); - - *node = (pm_keyword_hash_node_t) { - .base = { - .type = PM_KEYWORD_HASH_NODE, - .flags = PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }, - .elements = { 0 } - }; - - return node; + return pm_keyword_hash_node_new( + parser->arena, + ++parser->node_id, + PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }) + ); } /** * Append an element to a KeywordHashNode node. */ static void -pm_keyword_hash_node_elements_append(pm_keyword_hash_node_t *hash, pm_node_t *element) { +pm_keyword_hash_node_elements_append(pm_arena_t *arena, pm_keyword_hash_node_t *hash, pm_node_t *element) { // If the element being added is not an AssocNode or does not have a symbol // key, then we want to turn the SYMBOL_KEYS flag off. if (!PM_NODE_TYPE_P(element, PM_ASSOC_NODE) || !PM_NODE_TYPE_P(((pm_assoc_node_t *) element)->key, PM_SYMBOL_NODE)) { - pm_node_flag_unset((pm_node_t *)hash, PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS); + pm_node_flag_unset(UP(hash), PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS); } - pm_node_list_append(&hash->elements, element); - if (hash->base.location.start == NULL) { - hash->base.location.start = element->location.start; + pm_node_list_append(arena, &hash->elements, element); + if (PM_NODE_LENGTH(hash) == 0) { + PM_NODE_START_SET_NODE(hash, element); } - hash->base.location.end = element->location.end; + PM_NODE_LENGTH_SET_NODE(hash, element); } /** @@ -5562,22 +5371,14 @@ pm_keyword_hash_node_elements_append(pm_keyword_hash_node_t *hash, pm_node_t *el */ static pm_required_keyword_parameter_node_t * pm_required_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t *name) { - pm_required_keyword_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_required_keyword_parameter_node_t); - - *node = (pm_required_keyword_parameter_node_t) { - { - .type = PM_REQUIRED_KEYWORD_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = name->start, - .end = name->end - }, - }, - .name = pm_parser_constant_id_location(parser, name->start, name->end - 1), - .name_loc = PM_LOCATION_TOKEN_VALUE(name), - }; - - return node; + return pm_required_keyword_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_parser_constant_id_raw(parser, name->start, name->end - 1), + TOK2LOC(parser, name) + ); } /** @@ -5585,23 +5386,15 @@ pm_required_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t */ static pm_optional_keyword_parameter_node_t * pm_optional_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, pm_node_t *value) { - pm_optional_keyword_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_optional_keyword_parameter_node_t); - - *node = (pm_optional_keyword_parameter_node_t) { - { - .type = PM_OPTIONAL_KEYWORD_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = name->start, - .end = value->location.end - }, - }, - .name = pm_parser_constant_id_location(parser, name->start, name->end - 1), - .name_loc = PM_LOCATION_TOKEN_VALUE(name), - .value = value - }; - - return node; + return pm_optional_keyword_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, name, value), + pm_parser_constant_id_raw(parser, name->start, name->end - 1), + TOK2LOC(parser, name), + value + ); } /** @@ -5609,23 +5402,15 @@ pm_optional_keyword_parameter_node_create(pm_parser_t *parser, const pm_token_t */ static pm_keyword_rest_parameter_node_t * pm_keyword_rest_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *name) { - pm_keyword_rest_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_keyword_rest_parameter_node_t); - - *node = (pm_keyword_rest_parameter_node_t) { - { - .type = PM_KEYWORD_REST_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = (name->type == PM_TOKEN_NOT_PROVIDED ? operator->end : name->end) - }, - }, - .name = pm_parser_optional_constant_id_token(parser, name), - .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_keyword_rest_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + (name == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKENS(parser, operator, name), + name == NULL ? 0 : pm_parser_constant_id_token(parser, name), + NTOK2LOC(parser, name), + TOK2LOC(parser, operator) + ); } /** @@ -5641,26 +5426,18 @@ pm_lambda_node_create( pm_node_t *parameters, pm_node_t *body ) { - pm_lambda_node_t *node = PM_NODE_ALLOC(parser, pm_lambda_node_t); - - *node = (pm_lambda_node_t) { - { - .type = PM_LAMBDA_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = closing->end - }, - }, - .locals = *locals, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .parameters = parameters, - .body = body - }; - - return node; + return pm_lambda_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, operator, closing), + *locals, + TOK2LOC(parser, operator), + TOK2LOC(parser, opening), + TOK2LOC(parser, closing), + parameters, + body + ); } /** @@ -5670,25 +5447,18 @@ static pm_local_variable_and_write_node_t * pm_local_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE)); assert(operator->type == PM_TOKEN_AMPERSAND_AMPERSAND_EQUAL); - pm_local_variable_and_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_and_write_node_t); - - *node = (pm_local_variable_and_write_node_t) { - { - .type = PM_LOCAL_VARIABLE_AND_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name_loc = target->location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .name = name, - .depth = depth - }; - return node; + return pm_local_variable_and_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->location, + TOK2LOC(parser, operator), + value, + name, + depth + ); } /** @@ -5696,26 +5466,18 @@ pm_local_variable_and_write_node_create(pm_parser_t *parser, pm_node_t *target, */ static pm_local_variable_operator_write_node_t * pm_local_variable_operator_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { - pm_local_variable_operator_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_operator_write_node_t); - - *node = (pm_local_variable_operator_write_node_t) { - { - .type = PM_LOCAL_VARIABLE_OPERATOR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name_loc = target->location, - .binary_operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .name = name, - .binary_operator = pm_parser_constant_id_location(parser, operator->start, operator->end - 1), - .depth = depth - }; - - return node; + return pm_local_variable_operator_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->location, + TOK2LOC(parser, operator), + value, + name, + pm_parser_constant_id_raw(parser, operator->start, operator->end - 1), + depth + ); } /** @@ -5725,25 +5487,18 @@ static pm_local_variable_or_write_node_t * pm_local_variable_or_write_node_create(pm_parser_t *parser, pm_node_t *target, const pm_token_t *operator, pm_node_t *value, pm_constant_id_t name, uint32_t depth) { assert(PM_NODE_TYPE_P(target, PM_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_IT_LOCAL_VARIABLE_READ_NODE) || PM_NODE_TYPE_P(target, PM_CALL_NODE)); assert(operator->type == PM_TOKEN_PIPE_PIPE_EQUAL); - pm_local_variable_or_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_or_write_node_t); - *node = (pm_local_variable_or_write_node_t) { - { - .type = PM_LOCAL_VARIABLE_OR_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->location.start, - .end = value->location.end - } - }, - .name_loc = target->location, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value, - .name = name, - .depth = depth - }; - - return node; + return pm_local_variable_or_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(target, value), + target->location, + TOK2LOC(parser, operator), + value, + name, + depth + ); } /** @@ -5753,19 +5508,14 @@ static pm_local_variable_read_node_t * pm_local_variable_read_node_create_constant_id(pm_parser_t *parser, const pm_token_t *name, pm_constant_id_t name_id, uint32_t depth, bool missing) { if (!missing) pm_locals_read(&pm_parser_scope_find(parser, depth)->locals, name_id); - pm_local_variable_read_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_read_node_t); - - *node = (pm_local_variable_read_node_t) { - { - .type = PM_LOCAL_VARIABLE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name) - }, - .name = name_id, - .depth = depth - }; - - return node; + return pm_local_variable_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + name_id, + depth + ); } /** @@ -5792,32 +5542,23 @@ pm_local_variable_read_node_missing_create(pm_parser_t *parser, const pm_token_t */ static pm_local_variable_write_node_t * pm_local_variable_write_node_create(pm_parser_t *parser, pm_constant_id_t name, uint32_t depth, pm_node_t *value, const pm_location_t *name_loc, const pm_token_t *operator) { - pm_local_variable_write_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_write_node_t); - - *node = (pm_local_variable_write_node_t) { - { - .type = PM_LOCAL_VARIABLE_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = name_loc->start, - .end = value->location.end - } - }, - .name = name, - .depth = depth, - .value = value, - .name_loc = *name_loc, - .operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_local_variable_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + ((pm_location_t) { .start = name_loc->start, .length = PM_NODE_END(value) - name_loc->start }), + name, + depth, + *name_loc, + value, + TOK2LOC(parser, operator) + ); } /** * Returns true if the given bounds comprise `it`. */ -static inline bool +static PRISM_INLINE bool pm_token_is_it(const uint8_t *start, const uint8_t *end) { return (end - start == 2) && (start[0] == 'i') && (start[1] == 't'); } @@ -5826,19 +5567,24 @@ pm_token_is_it(const uint8_t *start, const uint8_t *end) { * Returns true if the given bounds comprise a numbered parameter (i.e., they * are of the form /^_\d$/). */ -static inline bool -pm_token_is_numbered_parameter(const uint8_t *start, const uint8_t *end) { - return (end - start == 2) && (start[0] == '_') && (start[1] != '0') && (pm_char_is_decimal_digit(start[1])); +static PRISM_INLINE bool +pm_token_is_numbered_parameter(const pm_parser_t *parser, uint32_t start, uint32_t length) { + return ( + (length == 2) && + (parser->start[start] == '_') && + (parser->start[start + 1] != '0') && + pm_char_is_decimal_digit(parser->start[start + 1]) + ); } /** * Ensure the given bounds do not comprise a numbered parameter. If they do, add * an appropriate error message to the parser. */ -static inline void -pm_refute_numbered_parameter(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { - if (pm_token_is_numbered_parameter(start, end)) { - PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_PARAMETER_NUMBERED_RESERVED, start); +static PRISM_INLINE void +pm_refute_numbered_parameter(pm_parser_t *parser, uint32_t start, uint32_t length) { + if (pm_token_is_numbered_parameter(parser, start, length)) { + PM_PARSER_ERR_FORMAT(parser, start, length, PM_ERR_PARAMETER_NUMBERED_RESERVED, parser->start + start); } } @@ -5848,20 +5594,16 @@ pm_refute_numbered_parameter(pm_parser_t *parser, const uint8_t *start, const ui */ static pm_local_variable_target_node_t * pm_local_variable_target_node_create(pm_parser_t *parser, const pm_location_t *location, pm_constant_id_t name, uint32_t depth) { - pm_refute_numbered_parameter(parser, location->start, location->end); - pm_local_variable_target_node_t *node = PM_NODE_ALLOC(parser, pm_local_variable_target_node_t); - - *node = (pm_local_variable_target_node_t) { - { - .type = PM_LOCAL_VARIABLE_TARGET_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = *location - }, - .name = name, - .depth = depth - }; - - return node; + pm_refute_numbered_parameter(parser, location->start, location->length); + + return pm_local_variable_target_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = location->start, .length = location->length }), + name, + depth + ); } /** @@ -5871,23 +5613,15 @@ static pm_match_predicate_node_t * pm_match_predicate_node_create(pm_parser_t *parser, pm_node_t *value, pm_node_t *pattern, const pm_token_t *operator) { pm_assert_value_expression(parser, value); - pm_match_predicate_node_t *node = PM_NODE_ALLOC(parser, pm_match_predicate_node_t); - - *node = (pm_match_predicate_node_t) { - { - .type = PM_MATCH_PREDICATE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = value->location.start, - .end = pattern->location.end - } - }, - .value = value, - .pattern = pattern, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_match_predicate_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(value, pattern), + value, + pattern, + TOK2LOC(parser, operator) + ); } /** @@ -5897,23 +5631,15 @@ static pm_match_required_node_t * pm_match_required_node_create(pm_parser_t *parser, pm_node_t *value, pm_node_t *pattern, const pm_token_t *operator) { pm_assert_value_expression(parser, value); - pm_match_required_node_t *node = PM_NODE_ALLOC(parser, pm_match_required_node_t); - - *node = (pm_match_required_node_t) { - { - .type = PM_MATCH_REQUIRED_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = value->location.start, - .end = pattern->location.end - } - }, - .value = value, - .pattern = pattern, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_match_required_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(value, pattern), + value, + pattern, + TOK2LOC(parser, operator) + ); } /** @@ -5921,19 +5647,14 @@ pm_match_required_node_create(pm_parser_t *parser, pm_node_t *value, pm_node_t * */ static pm_match_write_node_t * pm_match_write_node_create(pm_parser_t *parser, pm_call_node_t *call) { - pm_match_write_node_t *node = PM_NODE_ALLOC(parser, pm_match_write_node_t); - - *node = (pm_match_write_node_t) { - { - .type = PM_MATCH_WRITE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = call->base.location - }, - .call = call, - .targets = { 0 } - }; - - return node; + return pm_match_write_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODE(call), + call, + ((pm_node_list_t) { 0 }) + ); } /** @@ -5941,26 +5662,18 @@ pm_match_write_node_create(pm_parser_t *parser, pm_call_node_t *call) { */ static pm_module_node_t * pm_module_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *module_keyword, pm_node_t *constant_path, const pm_token_t *name, pm_node_t *body, const pm_token_t *end_keyword) { - pm_module_node_t *node = PM_NODE_ALLOC(parser, pm_module_node_t); - - *node = (pm_module_node_t) { - { - .type = PM_MODULE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = module_keyword->start, - .end = end_keyword->end - } - }, - .locals = (locals == NULL ? ((pm_constant_id_list_t) { .ids = NULL, .size = 0, .capacity = 0 }) : *locals), - .module_keyword_loc = PM_LOCATION_TOKEN_VALUE(module_keyword), - .constant_path = constant_path, - .body = body, - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword), - .name = pm_parser_constant_id_token(parser, name) - }; - - return node; + return pm_module_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, module_keyword, end_keyword), + (locals == NULL ? ((pm_constant_id_list_t) { .ids = NULL, .size = 0, .capacity = 0 }) : *locals), + TOK2LOC(parser, module_keyword), + constant_path, + body, + TOK2LOC(parser, end_keyword), + pm_parser_constant_id_token(parser, name) + ); } /** @@ -5968,22 +5681,17 @@ pm_module_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const */ static pm_multi_target_node_t * pm_multi_target_node_create(pm_parser_t *parser) { - pm_multi_target_node_t *node = PM_NODE_ALLOC(parser, pm_multi_target_node_t); - - *node = (pm_multi_target_node_t) { - { - .type = PM_MULTI_TARGET_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = NULL, .end = NULL } - }, - .lefts = { 0 }, - .rest = NULL, - .rights = { 0 }, - .lparen_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .rparen_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + return pm_multi_target_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }), + NULL, + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }) + ); } /** @@ -5996,27 +5704,27 @@ pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t node->rest = target; } else { pm_parser_err_node(parser, target, PM_ERR_MULTI_ASSIGN_MULTI_SPLATS); - pm_node_list_append(&node->rights, target); + pm_node_list_append(parser->arena, &node->rights, target); } } else if (PM_NODE_TYPE_P(target, PM_IMPLICIT_REST_NODE)) { if (node->rest == NULL) { node->rest = target; } else { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST); - pm_node_list_append(&node->rights, target); + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &parser->current, PM_ERR_MULTI_ASSIGN_UNEXPECTED_REST); + pm_node_list_append(parser->arena, &node->rights, target); } } else if (node->rest == NULL) { - pm_node_list_append(&node->lefts, target); + pm_node_list_append(parser->arena, &node->lefts, target); } else { - pm_node_list_append(&node->rights, target); + pm_node_list_append(parser->arena, &node->rights, target); } - if (node->base.location.start == NULL || (node->base.location.start > target->location.start)) { - node->base.location.start = target->location.start; + if (PM_NODE_LENGTH(node) == 0 || (PM_NODE_START(node) > PM_NODE_START(target))) { + PM_NODE_START_SET_NODE(node, target); } - if (node->base.location.end == NULL || (node->base.location.end < target->location.end)) { - node->base.location.end = target->location.end; + if (PM_NODE_LENGTH(node) == 0 || (PM_NODE_END(node) < PM_NODE_END(target))) { + PM_NODE_LENGTH_SET_NODE(node, target); } } @@ -6024,18 +5732,19 @@ pm_multi_target_node_targets_append(pm_parser_t *parser, pm_multi_target_node_t * Set the opening of a MultiTargetNode node. */ static void -pm_multi_target_node_opening_set(pm_multi_target_node_t *node, const pm_token_t *lparen) { - node->base.location.start = lparen->start; - node->lparen_loc = PM_LOCATION_TOKEN_VALUE(lparen); +pm_multi_target_node_opening_set(const pm_parser_t *parser, pm_multi_target_node_t *node, const pm_token_t *lparen) { + PM_NODE_START_SET_TOKEN(parser, node, lparen); + PM_NODE_LENGTH_SET_TOKEN(parser, node, lparen); + node->lparen_loc = TOK2LOC(parser, lparen); } /** * Set the closing of a MultiTargetNode node. */ static void -pm_multi_target_node_closing_set(pm_multi_target_node_t *node, const pm_token_t *rparen) { - node->base.location.end = rparen->end; - node->rparen_loc = PM_LOCATION_TOKEN_VALUE(rparen); +pm_multi_target_node_closing_set(const pm_parser_t *parser, pm_multi_target_node_t *node, const pm_token_t *rparen) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, rparen); + node->rparen_loc = TOK2LOC(parser, rparen); } /** @@ -6043,32 +5752,21 @@ pm_multi_target_node_closing_set(pm_multi_target_node_t *node, const pm_token_t */ static pm_multi_write_node_t * pm_multi_write_node_create(pm_parser_t *parser, pm_multi_target_node_t *target, const pm_token_t *operator, pm_node_t *value) { - pm_multi_write_node_t *node = PM_NODE_ALLOC(parser, pm_multi_write_node_t); - - *node = (pm_multi_write_node_t) { - { - .type = PM_MULTI_WRITE_NODE, - .flags = pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = target->base.location.start, - .end = value->location.end - } - }, - .lefts = target->lefts, - .rest = target->rest, - .rights = target->rights, - .lparen_loc = target->lparen_loc, - .rparen_loc = target->rparen_loc, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - // Explicitly do not call pm_node_destroy here because we want to keep - // around all of the information within the MultiWriteNode node. - xfree(target); - - return node; + /* The target is no longer necessary because we have reused its children. It + * is arena-allocated so no explicit free is needed. */ + return pm_multi_write_node_new( + parser->arena, + ++parser->node_id, + pm_implicit_array_write_flags(value, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY), + PM_LOCATION_INIT_NODES(target, value), + target->lefts, + target->rest, + target->rights, + target->lparen_loc, + target->rparen_loc, + TOK2LOC(parser, operator), + value + ); } /** @@ -6077,22 +5775,15 @@ pm_multi_write_node_create(pm_parser_t *parser, pm_multi_target_node_t *target, static pm_next_node_t * pm_next_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { assert(keyword->type == PM_TOKEN_KEYWORD_NEXT); - pm_next_node_t *node = PM_NODE_ALLOC(parser, pm_next_node_t); - - *node = (pm_next_node_t) { - { - .type = PM_NEXT_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = (arguments == NULL ? keyword->end : arguments->base.location.end) - } - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .arguments = arguments - }; - return node; + return pm_next_node_new( + parser->arena, + ++parser->node_id, + 0, + (arguments == NULL) ? PM_LOCATION_INIT_TOKEN(parser, keyword) : PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, arguments), + arguments, + TOK2LOC(parser, keyword) + ); } /** @@ -6101,16 +5792,31 @@ pm_next_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments static pm_nil_node_t * pm_nil_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_NIL); - pm_nil_node_t *node = PM_NODE_ALLOC(parser, pm_nil_node_t); - *node = (pm_nil_node_t) {{ - .type = PM_NIL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; + return pm_nil_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token) + ); +} - return node; +/** + * Allocate and initialize a new NoKeywordsParameterNode node. + */ +static pm_no_block_parameter_node_t * +pm_no_block_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *keyword) { + assert(operator->type == PM_TOKEN_AMPERSAND || operator->type == PM_TOKEN_UAMPERSAND); + assert(keyword->type == PM_TOKEN_KEYWORD_NIL); + + return pm_no_block_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, operator, keyword), + TOK2LOC(parser, operator), + TOK2LOC(parser, keyword) + ); } /** @@ -6120,41 +5826,29 @@ static pm_no_keywords_parameter_node_t * pm_no_keywords_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *keyword) { assert(operator->type == PM_TOKEN_USTAR_STAR || operator->type == PM_TOKEN_STAR_STAR); assert(keyword->type == PM_TOKEN_KEYWORD_NIL); - pm_no_keywords_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_no_keywords_parameter_node_t); - - *node = (pm_no_keywords_parameter_node_t) { - { - .type = PM_NO_KEYWORDS_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = keyword->end - } - }, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword) - }; - return node; + return pm_no_keywords_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, operator, keyword), + TOK2LOC(parser, operator), + TOK2LOC(parser, keyword) + ); } /** * Allocate and initialize a new NumberedParametersNode node. */ static pm_numbered_parameters_node_t * -pm_numbered_parameters_node_create(pm_parser_t *parser, const pm_location_t *location, uint8_t maximum) { - pm_numbered_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_numbered_parameters_node_t); - - *node = (pm_numbered_parameters_node_t) { - { - .type = PM_NUMBERED_PARAMETERS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = *location - }, - .maximum = maximum - }; - - return node; +pm_numbered_parameters_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *closing, uint8_t maximum) { + return pm_numbered_parameters_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + maximum + ); } /** @@ -6190,14 +5884,14 @@ pm_numbered_reference_read_node_number(pm_parser_t *parser, const pm_token_t *to unsigned long value = strtoul(digits, &endptr, 10); if ((digits == endptr) || (*endptr != '\0')) { - pm_parser_err(parser, start, end, PM_ERR_INVALID_NUMBER_DECIMAL); + pm_parser_err(parser, U32(start - parser->start), U32(length), PM_ERR_INVALID_NUMBER_DECIMAL); value = 0; } - xfree(digits); + xfree_sized(digits, sizeof(char) * (length + 1)); if ((errno == ERANGE) || (value > NTH_REF_MAX)) { - PM_PARSER_WARN_FORMAT(parser, start, end, PM_WARN_INVALID_NUMBERED_REFERENCE, (int) (length + 1), (const char *) token->start); + PM_PARSER_WARN_FORMAT(parser, U32(start - parser->start), U32(length), PM_WARN_INVALID_NUMBERED_REFERENCE, (int) (length + 1), (const char *) token->start); value = 0; } @@ -6212,18 +5906,14 @@ pm_numbered_reference_read_node_number(pm_parser_t *parser, const pm_token_t *to static pm_numbered_reference_read_node_t * pm_numbered_reference_read_node_create(pm_parser_t *parser, const pm_token_t *name) { assert(name->type == PM_TOKEN_NUMBERED_REFERENCE); - pm_numbered_reference_read_node_t *node = PM_NODE_ALLOC(parser, pm_numbered_reference_read_node_t); - *node = (pm_numbered_reference_read_node_t) { - { - .type = PM_NUMBERED_REFERENCE_READ_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(name), - }, - .number = pm_numbered_reference_read_node_number(parser, name) - }; - - return node; + return pm_numbered_reference_read_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, name), + pm_numbered_reference_read_node_number(parser, name) + ); } /** @@ -6231,24 +5921,16 @@ pm_numbered_reference_read_node_create(pm_parser_t *parser, const pm_token_t *na */ static pm_optional_parameter_node_t * pm_optional_parameter_node_create(pm_parser_t *parser, const pm_token_t *name, const pm_token_t *operator, pm_node_t *value) { - pm_optional_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_optional_parameter_node_t); - - *node = (pm_optional_parameter_node_t) { - { - .type = PM_OPTIONAL_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = name->start, - .end = value->location.end - } - }, - .name = pm_parser_constant_id_token(parser, name), - .name_loc = PM_LOCATION_TOKEN_VALUE(name), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .value = value - }; - - return node; + return pm_optional_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, name, value), + pm_parser_constant_id_token(parser, name), + TOK2LOC(parser, name), + TOK2LOC(parser, operator), + value + ); } /** @@ -6258,23 +5940,15 @@ static pm_or_node_t * pm_or_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { pm_assert_value_expression(parser, left); - pm_or_node_t *node = PM_NODE_ALLOC(parser, pm_or_node_t); - - *node = (pm_or_node_t) { - { - .type = PM_OR_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = left->location.start, - .end = right->location.end - } - }, - .left = left, - .right = right, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_or_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(left, right), + left, + right, + TOK2LOC(parser, operator) + ); } /** @@ -6282,24 +5956,19 @@ pm_or_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operat */ static pm_parameters_node_t * pm_parameters_node_create(pm_parser_t *parser) { - pm_parameters_node_t *node = PM_NODE_ALLOC(parser, pm_parameters_node_t); - - *node = (pm_parameters_node_t) { - { - .type = PM_PARAMETERS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(&parser->current) - }, - .rest = NULL, - .keyword_rest = NULL, - .block = NULL, - .requireds = { 0 }, - .optionals = { 0 }, - .posts = { 0 }, - .keywords = { 0 } - }; - - return node; + return pm_parameters_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }), + ((pm_node_list_t) { 0 }), + NULL, + ((pm_node_list_t) { 0 }), + ((pm_node_list_t) { 0 }), + NULL, + NULL + ); } /** @@ -6307,16 +5976,12 @@ pm_parameters_node_create(pm_parser_t *parser) { */ static void pm_parameters_node_location_set(pm_parameters_node_t *params, pm_node_t *param) { - if (params->base.location.start == NULL) { - params->base.location.start = param->location.start; - } else { - params->base.location.start = params->base.location.start < param->location.start ? params->base.location.start : param->location.start; + if ((params->base.location.length == 0) || PM_NODE_START(params) > PM_NODE_START(param)) { + PM_NODE_START_SET_NODE(params, param); } - if (params->base.location.end == NULL) { - params->base.location.end = param->location.end; - } else { - params->base.location.end = params->base.location.end > param->location.end ? params->base.location.end : param->location.end; + if ((params->base.location.length == 0) || (PM_NODE_END(params) < PM_NODE_END(param))) { + PM_NODE_LENGTH_SET_NODE(params, param); } } @@ -6324,27 +5989,27 @@ pm_parameters_node_location_set(pm_parameters_node_t *params, pm_node_t *param) * Append a required parameter to a ParametersNode node. */ static void -pm_parameters_node_requireds_append(pm_parameters_node_t *params, pm_node_t *param) { +pm_parameters_node_requireds_append(pm_arena_t *arena, pm_parameters_node_t *params, pm_node_t *param) { pm_parameters_node_location_set(params, param); - pm_node_list_append(¶ms->requireds, param); + pm_node_list_append(arena, ¶ms->requireds, param); } /** * Append an optional parameter to a ParametersNode node. */ static void -pm_parameters_node_optionals_append(pm_parameters_node_t *params, pm_optional_parameter_node_t *param) { - pm_parameters_node_location_set(params, (pm_node_t *) param); - pm_node_list_append(¶ms->optionals, (pm_node_t *) param); +pm_parameters_node_optionals_append(pm_arena_t *arena, pm_parameters_node_t *params, pm_optional_parameter_node_t *param) { + pm_parameters_node_location_set(params, UP(param)); + pm_node_list_append(arena, ¶ms->optionals, UP(param)); } /** * Append a post optional arguments parameter to a ParametersNode node. */ static void -pm_parameters_node_posts_append(pm_parameters_node_t *params, pm_node_t *param) { +pm_parameters_node_posts_append(pm_arena_t *arena, pm_parameters_node_t *params, pm_node_t *param) { pm_parameters_node_location_set(params, param); - pm_node_list_append(¶ms->posts, param); + pm_node_list_append(arena, ¶ms->posts, param); } /** @@ -6360,9 +6025,9 @@ pm_parameters_node_rest_set(pm_parameters_node_t *params, pm_node_t *param) { * Append a keyword parameter to a ParametersNode node. */ static void -pm_parameters_node_keywords_append(pm_parameters_node_t *params, pm_node_t *param) { +pm_parameters_node_keywords_append(pm_arena_t *arena, pm_parameters_node_t *params, pm_node_t *param) { pm_parameters_node_location_set(params, param); - pm_node_list_append(¶ms->keywords, param); + pm_node_list_append(arena, ¶ms->keywords, param); } /** @@ -6379,9 +6044,9 @@ pm_parameters_node_keyword_rest_set(pm_parameters_node_t *params, pm_node_t *par * Set the block parameter on a ParametersNode node. */ static void -pm_parameters_node_block_set(pm_parameters_node_t *params, pm_block_parameter_node_t *param) { +pm_parameters_node_block_set(pm_parameters_node_t *params, pm_node_t *param) { assert(params->block == NULL); - pm_parameters_node_location_set(params, (pm_node_t *) param); + pm_parameters_node_location_set(params, param); params->block = param; } @@ -6390,22 +6055,14 @@ pm_parameters_node_block_set(pm_parameters_node_t *params, pm_block_parameter_no */ static pm_program_node_t * pm_program_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, pm_statements_node_t *statements) { - pm_program_node_t *node = PM_NODE_ALLOC(parser, pm_program_node_t); - - *node = (pm_program_node_t) { - { - .type = PM_PROGRAM_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = statements == NULL ? parser->start : statements->base.location.start, - .end = statements == NULL ? parser->end : statements->base.location.end - } - }, - .locals = *locals, - .statements = statements - }; - - return node; + return pm_program_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODE(statements), + *locals, + statements + ); } /** @@ -6413,24 +6070,15 @@ pm_program_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, pm_st */ static pm_parentheses_node_t * pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_node_t *body, const pm_token_t *closing, pm_node_flags_t flags) { - pm_parentheses_node_t *node = PM_NODE_ALLOC(parser, pm_parentheses_node_t); - - *node = (pm_parentheses_node_t) { - { - .type = PM_PARENTHESES_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - } - }, - .body = body, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) - }; - - return node; + return pm_parentheses_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + body, + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -6438,24 +6086,16 @@ pm_parentheses_node_create(pm_parser_t *parser, const pm_token_t *opening, pm_no */ static pm_pinned_expression_node_t * pm_pinned_expression_node_create(pm_parser_t *parser, pm_node_t *expression, const pm_token_t *operator, const pm_token_t *lparen, const pm_token_t *rparen) { - pm_pinned_expression_node_t *node = PM_NODE_ALLOC(parser, pm_pinned_expression_node_t); - - *node = (pm_pinned_expression_node_t) { - { - .type = PM_PINNED_EXPRESSION_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = rparen->end - } - }, - .expression = expression, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .lparen_loc = PM_LOCATION_TOKEN_VALUE(lparen), - .rparen_loc = PM_LOCATION_TOKEN_VALUE(rparen) - }; - - return node; + return pm_pinned_expression_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, operator, rparen), + expression, + TOK2LOC(parser, operator), + TOK2LOC(parser, lparen), + TOK2LOC(parser, rparen) + ); } /** @@ -6463,22 +6103,14 @@ pm_pinned_expression_node_create(pm_parser_t *parser, pm_node_t *expression, con */ static pm_pinned_variable_node_t * pm_pinned_variable_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *variable) { - pm_pinned_variable_node_t *node = PM_NODE_ALLOC(parser, pm_pinned_variable_node_t); - - *node = (pm_pinned_variable_node_t) { - { - .type = PM_PINNED_VARIABLE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = variable->location.end - } - }, - .variable = variable, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_pinned_variable_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN_NODE(parser, operator, variable), + variable, + TOK2LOC(parser, operator) + ); } /** @@ -6486,24 +6118,16 @@ pm_pinned_variable_node_create(pm_parser_t *parser, const pm_token_t *operator, */ static pm_post_execution_node_t * pm_post_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { - pm_post_execution_node_t *node = PM_NODE_ALLOC(parser, pm_post_execution_node_t); - - *node = (pm_post_execution_node_t) { - { - .type = PM_POST_EXECUTION_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = closing->end - } - }, - .statements = statements, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) - }; - - return node; + return pm_post_execution_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, keyword, closing), + statements, + TOK2LOC(parser, keyword), + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -6511,24 +6135,16 @@ pm_post_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, co */ static pm_pre_execution_node_t * pm_pre_execution_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *opening, pm_statements_node_t *statements, const pm_token_t *closing) { - pm_pre_execution_node_t *node = PM_NODE_ALLOC(parser, pm_pre_execution_node_t); - - *node = (pm_pre_execution_node_t) { - { - .type = PM_PRE_EXECUTION_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = closing->end - } - }, - .statements = statements, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing) - }; - - return node; + return pm_pre_execution_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, keyword, closing), + statements, + TOK2LOC(parser, keyword), + TOK2LOC(parser, opening), + TOK2LOC(parser, closing) + ); } /** @@ -6538,8 +6154,6 @@ static pm_range_node_t * pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *operator, pm_node_t *right) { pm_assert_value_expression(parser, left); pm_assert_value_expression(parser, right); - - pm_range_node_t *node = PM_NODE_ALLOC(parser, pm_range_node_t); pm_node_flags_t flags = 0; // Indicate that this node is an exclusive range if the operator is `...`. @@ -6557,22 +6171,18 @@ pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *ope flags |= PM_NODE_FLAG_STATIC_LITERAL; } - *node = (pm_range_node_t) { - { - .type = PM_RANGE_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = (left == NULL ? operator->start : left->location.start), - .end = (right == NULL ? operator->end : right->location.end) - } - }, - .left = left, - .right = right, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; + uint32_t start = left == NULL ? PM_TOKEN_START(parser, operator) : PM_NODE_START(left); + uint32_t end = right == NULL ? PM_TOKEN_END(parser, operator) : PM_NODE_END(right); - return node; + return pm_range_node_new( + parser->arena, + ++parser->node_id, + flags, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + left, + right, + TOK2LOC(parser, operator) + ); } /** @@ -6581,15 +6191,13 @@ pm_range_node_create(pm_parser_t *parser, pm_node_t *left, const pm_token_t *ope static pm_redo_node_t * pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_REDO); - pm_redo_node_t *node = PM_NODE_ALLOC(parser, pm_redo_node_t); - *node = (pm_redo_node_t) {{ - .type = PM_REDO_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - - return node; + return pm_redo_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -6598,31 +6206,22 @@ pm_redo_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_regular_expression_node_t * pm_regular_expression_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { - pm_regular_expression_node_t *node = PM_NODE_ALLOC(parser, pm_regular_expression_node_t); - - *node = (pm_regular_expression_node_t) { - { - .type = PM_REGULAR_EXPRESSION_NODE, - .flags = pm_regular_expression_flags_create(parser, closing) | PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = MIN(opening->start, closing->start), - .end = MAX(opening->end, closing->end) - } - }, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .content_loc = PM_LOCATION_TOKEN_VALUE(content), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .unescaped = *unescaped - }; - - return node; + return pm_regular_expression_node_new( + parser->arena, + ++parser->node_id, + pm_regular_expression_flags_create(parser, closing) | PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + TOK2LOC(parser, opening), + TOK2LOC(parser, content), + TOK2LOC(parser, closing), + *unescaped + ); } /** * Allocate a new initialize a new RegularExpressionNode node. */ -static inline pm_regular_expression_node_t * +static PRISM_INLINE pm_regular_expression_node_t * pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { return pm_regular_expression_node_create_unescaped(parser, opening, content, closing, &PM_STRING_EMPTY); } @@ -6632,18 +6231,13 @@ pm_regular_expression_node_create(pm_parser_t *parser, const pm_token_t *opening */ static pm_required_parameter_node_t * pm_required_parameter_node_create(pm_parser_t *parser, const pm_token_t *token) { - pm_required_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_required_parameter_node_t); - - *node = (pm_required_parameter_node_t) { - { - .type = PM_REQUIRED_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }, - .name = pm_parser_constant_id_token(parser, token) - }; - - return node; + return pm_required_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token), + pm_parser_constant_id_token(parser, token) + ); } /** @@ -6651,23 +6245,15 @@ pm_required_parameter_node_create(pm_parser_t *parser, const pm_token_t *token) */ static pm_rescue_modifier_node_t * pm_rescue_modifier_node_create(pm_parser_t *parser, pm_node_t *expression, const pm_token_t *keyword, pm_node_t *rescue_expression) { - pm_rescue_modifier_node_t *node = PM_NODE_ALLOC(parser, pm_rescue_modifier_node_t); - - *node = (pm_rescue_modifier_node_t) { - { - .type = PM_RESCUE_MODIFIER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = expression->location.start, - .end = rescue_expression->location.end - } - }, - .expression = expression, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .rescue_expression = rescue_expression - }; - - return node; + return pm_rescue_modifier_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_NODES(expression, rescue_expression), + expression, + TOK2LOC(parser, keyword), + rescue_expression + ); } /** @@ -6675,29 +6261,24 @@ pm_rescue_modifier_node_create(pm_parser_t *parser, pm_node_t *expression, const */ static pm_rescue_node_t * pm_rescue_node_create(pm_parser_t *parser, const pm_token_t *keyword) { - pm_rescue_node_t *node = PM_NODE_ALLOC(parser, pm_rescue_node_t); - - *node = (pm_rescue_node_t) { - { - .type = PM_RESCUE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(keyword) - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .operator_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .reference = NULL, - .statements = NULL, - .subsequent = NULL, - .exceptions = { 0 } - }; - - return node; + return pm_rescue_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, keyword), + TOK2LOC(parser, keyword), + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + NULL, + ((pm_location_t) { 0 }), + NULL, + NULL + ); } -static inline void -pm_rescue_node_operator_set(pm_rescue_node_t *node, const pm_token_t *operator) { - node->operator_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(operator); +static PRISM_INLINE void +pm_rescue_node_operator_set(const pm_parser_t *parser, pm_rescue_node_t *node, const pm_token_t *operator) { + node->operator_loc = TOK2LOC(parser, operator); } /** @@ -6706,7 +6287,7 @@ pm_rescue_node_operator_set(pm_rescue_node_t *node, const pm_token_t *operator) static void pm_rescue_node_reference_set(pm_rescue_node_t *node, pm_node_t *reference) { node->reference = reference; - node->base.location.end = reference->location.end; + PM_NODE_LENGTH_SET_NODE(node, reference); } /** @@ -6716,7 +6297,7 @@ static void pm_rescue_node_statements_set(pm_rescue_node_t *node, pm_statements_node_t *statements) { node->statements = statements; if (pm_statements_node_body_length(statements) > 0) { - node->base.location.end = statements->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, statements); } } @@ -6726,16 +6307,16 @@ pm_rescue_node_statements_set(pm_rescue_node_t *node, pm_statements_node_t *stat static void pm_rescue_node_subsequent_set(pm_rescue_node_t *node, pm_rescue_node_t *subsequent) { node->subsequent = subsequent; - node->base.location.end = subsequent->base.location.end; + PM_NODE_LENGTH_SET_NODE(node, subsequent); } /** * Append an exception node to a rescue node, and update the location. */ static void -pm_rescue_node_exceptions_append(pm_rescue_node_t *node, pm_node_t *exception) { - pm_node_list_append(&node->exceptions, exception); - node->base.location.end = exception->location.end; +pm_rescue_node_exceptions_append(pm_arena_t *arena, pm_rescue_node_t *node, pm_node_t *exception) { + pm_node_list_append(arena, &node->exceptions, exception); + PM_NODE_LENGTH_SET_NODE(node, exception); } /** @@ -6743,23 +6324,15 @@ pm_rescue_node_exceptions_append(pm_rescue_node_t *node, pm_node_t *exception) { */ static pm_rest_parameter_node_t * pm_rest_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, const pm_token_t *name) { - pm_rest_parameter_node_t *node = PM_NODE_ALLOC(parser, pm_rest_parameter_node_t); - - *node = (pm_rest_parameter_node_t) { - { - .type = PM_REST_PARAMETER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = (name->type == PM_TOKEN_NOT_PROVIDED ? operator->end : name->end) - } - }, - .name = pm_parser_optional_constant_id_token(parser, name), - .name_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(name), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator) - }; - - return node; + return pm_rest_parameter_node_new( + parser->arena, + ++parser->node_id, + 0, + (name == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKENS(parser, operator, name), + name == NULL ? 0 : pm_parser_constant_id_token(parser, name), + NTOK2LOC(parser, name), + TOK2LOC(parser, operator) + ); } /** @@ -6768,15 +6341,13 @@ pm_rest_parameter_node_create(pm_parser_t *parser, const pm_token_t *operator, c static pm_retry_node_t * pm_retry_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_RETRY); - pm_retry_node_t *node = PM_NODE_ALLOC(parser, pm_retry_node_t); - *node = (pm_retry_node_t) {{ - .type = PM_RETRY_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - - return node; + return pm_retry_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -6784,22 +6355,14 @@ pm_retry_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_return_node_t * pm_return_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_node_t *arguments) { - pm_return_node_t *node = PM_NODE_ALLOC(parser, pm_return_node_t); - - *node = (pm_return_node_t) { - { - .type = PM_RETURN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = (arguments == NULL ? keyword->end : arguments->base.location.end) - } - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .arguments = arguments - }; - - return node; + return pm_return_node_new( + parser->arena, + ++parser->node_id, + 0, + (arguments == NULL) ? PM_LOCATION_INIT_TOKEN(parser, keyword) : PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, arguments), + TOK2LOC(parser, keyword), + arguments + ); } /** @@ -6808,15 +6371,13 @@ pm_return_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argumen static pm_self_node_t * pm_self_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_SELF); - pm_self_node_t *node = PM_NODE_ALLOC(parser, pm_self_node_t); - *node = (pm_self_node_t) {{ - .type = PM_SELF_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - - return node; + return pm_self_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -6824,19 +6385,13 @@ pm_self_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_shareable_constant_node_t * pm_shareable_constant_node_create(pm_parser_t *parser, pm_node_t *write, pm_shareable_constant_value_t value) { - pm_shareable_constant_node_t *node = PM_NODE_ALLOC(parser, pm_shareable_constant_node_t); - - *node = (pm_shareable_constant_node_t) { - { - .type = PM_SHAREABLE_CONSTANT_NODE, - .flags = (pm_node_flags_t) value, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NODE_VALUE(write) - }, - .write = write - }; - - return node; + return pm_shareable_constant_node_new( + parser->arena, + ++parser->node_id, + (pm_node_flags_t) value, + PM_LOCATION_INIT_NODE(write), + write + ); } /** @@ -6844,26 +6399,18 @@ pm_shareable_constant_node_create(pm_parser_t *parser, pm_node_t *write, pm_shar */ static pm_singleton_class_node_t * pm_singleton_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *locals, const pm_token_t *class_keyword, const pm_token_t *operator, pm_node_t *expression, pm_node_t *body, const pm_token_t *end_keyword) { - pm_singleton_class_node_t *node = PM_NODE_ALLOC(parser, pm_singleton_class_node_t); - - *node = (pm_singleton_class_node_t) { - { - .type = PM_SINGLETON_CLASS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = class_keyword->start, - .end = end_keyword->end - } - }, - .locals = *locals, - .class_keyword_loc = PM_LOCATION_TOKEN_VALUE(class_keyword), - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .expression = expression, - .body = body, - .end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword) - }; - - return node; + return pm_singleton_class_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKENS(parser, class_keyword, end_keyword), + *locals, + TOK2LOC(parser, class_keyword), + TOK2LOC(parser, operator), + expression, + body, + TOK2LOC(parser, end_keyword) + ); } /** @@ -6872,16 +6419,13 @@ pm_singleton_class_node_create(pm_parser_t *parser, pm_constant_id_list_t *local static pm_source_encoding_node_t * pm_source_encoding_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD___ENCODING__); - pm_source_encoding_node_t *node = PM_NODE_ALLOC(parser, pm_source_encoding_node_t); - - *node = (pm_source_encoding_node_t) {{ - .type = PM_SOURCE_ENCODING_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - return node; + return pm_source_encoding_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -6889,7 +6433,6 @@ pm_source_encoding_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_source_file_node_t* pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword) { - pm_source_file_node_t *node = PM_NODE_ALLOC(parser, pm_source_file_node_t); assert(file_keyword->type == PM_TOKEN_KEYWORD___FILE__); pm_node_flags_t flags = 0; @@ -6903,17 +6446,13 @@ pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword) break; } - *node = (pm_source_file_node_t) { - { - .type = PM_SOURCE_FILE_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(file_keyword), - }, - .filepath = parser->filepath - }; - - return node; + return pm_source_file_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_TOKEN(parser, file_keyword), + parser->filepath + ); } /** @@ -6922,16 +6461,13 @@ pm_source_file_node_create(pm_parser_t *parser, const pm_token_t *file_keyword) static pm_source_line_node_t * pm_source_line_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD___LINE__); - pm_source_line_node_t *node = PM_NODE_ALLOC(parser, pm_source_line_node_t); - *node = (pm_source_line_node_t) {{ - .type = PM_SOURCE_LINE_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - - return node; + return pm_source_line_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -6939,22 +6475,14 @@ pm_source_line_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_splat_node_t * pm_splat_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t *expression) { - pm_splat_node_t *node = PM_NODE_ALLOC(parser, pm_splat_node_t); - - *node = (pm_splat_node_t) { - { - .type = PM_SPLAT_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = operator->start, - .end = (expression == NULL ? operator->end : expression->location.end) - } - }, - .operator_loc = PM_LOCATION_TOKEN_VALUE(operator), - .expression = expression - }; - - return node; + return pm_splat_node_new( + parser->arena, + ++parser->node_id, + 0, + (expression == NULL) ? PM_LOCATION_INIT_TOKEN(parser, operator) : PM_LOCATION_INIT_TOKEN_NODE(parser, operator, expression), + TOK2LOC(parser, operator), + expression + ); } /** @@ -6962,18 +6490,13 @@ pm_splat_node_create(pm_parser_t *parser, const pm_token_t *operator, pm_node_t */ static pm_statements_node_t * pm_statements_node_create(pm_parser_t *parser) { - pm_statements_node_t *node = PM_NODE_ALLOC(parser, pm_statements_node_t); - - *node = (pm_statements_node_t) { - { - .type = PM_STATEMENTS_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .body = { 0 } - }; - - return node; + return pm_statements_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + ((pm_node_list_t) { 0 }) + ); } /** @@ -6985,25 +6508,17 @@ pm_statements_node_body_length(pm_statements_node_t *node) { } /** - * Set the location of the given StatementsNode. - */ -static void -pm_statements_node_location_set(pm_statements_node_t *node, const uint8_t *start, const uint8_t *end) { - node->base.location = (pm_location_t) { .start = start, .end = end }; -} - -/** * Update the location of the statements node based on the statement that is * being added to the list. */ -static inline void +static PRISM_INLINE void pm_statements_node_body_update(pm_statements_node_t *node, pm_node_t *statement) { - if (pm_statements_node_body_length(node) == 0 || statement->location.start < node->base.location.start) { - node->base.location.start = statement->location.start; + if (pm_statements_node_body_length(node) == 0 || PM_NODE_START(statement) < PM_NODE_START(node)) { + PM_NODE_START_SET_NODE(node, statement); } - if (statement->location.end > node->base.location.end) { - node->base.location.end = statement->location.end; + if (PM_NODE_END(statement) > PM_NODE_END(node)) { + PM_NODE_LENGTH_SET_NODE(node, statement); } } @@ -7030,7 +6545,7 @@ pm_statements_node_body_append(pm_parser_t *parser, pm_statements_node_t *node, } } - pm_node_list_append(&node->body, statement); + pm_node_list_append(parser->arena, &node->body, statement); if (newline) pm_node_flag_set(statement, PM_NODE_FLAG_NEWLINE); } @@ -7038,18 +6553,17 @@ pm_statements_node_body_append(pm_parser_t *parser, pm_statements_node_t *node, * Prepend a new node to the given StatementsNode node's body. */ static void -pm_statements_node_body_prepend(pm_statements_node_t *node, pm_node_t *statement) { +pm_statements_node_body_prepend(pm_arena_t *arena, pm_statements_node_t *node, pm_node_t *statement) { pm_statements_node_body_update(node, statement); - pm_node_list_prepend(&node->body, statement); + pm_node_list_prepend(arena, &node->body, statement); pm_node_flag_set(statement, PM_NODE_FLAG_NEWLINE); } /** * Allocate a new StringNode node with the current string on the parser. */ -static inline pm_string_node_t * +static PRISM_INLINE pm_string_node_t * pm_string_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *string) { - pm_string_node_t *node = PM_NODE_ALLOC(parser, pm_string_node_t); pm_node_flags_t flags = 0; switch (parser->frozen_string_literal) { @@ -7061,23 +6575,19 @@ pm_string_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, break; } - *node = (pm_string_node_t) { - { - .type = PM_STRING_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = (opening->type == PM_TOKEN_NOT_PROVIDED ? content->start : opening->start), - .end = (closing->type == PM_TOKEN_NOT_PROVIDED ? content->end : closing->end) - } - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .content_loc = PM_LOCATION_TOKEN_VALUE(content), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .unescaped = *string - }; + uint32_t start = PM_TOKEN_START(parser, opening == NULL ? content : opening); + uint32_t end = PM_TOKEN_END(parser, closing == NULL ? content : closing); - return node; + return pm_string_node_new( + parser->arena, + ++parser->node_id, + flags, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NTOK2LOC(parser, opening), + TOK2LOC(parser, content), + NTOK2LOC(parser, closing), + *string + ); } /** @@ -7105,30 +6615,21 @@ pm_string_node_create_current_string(pm_parser_t *parser, const pm_token_t *open static pm_super_node_t * pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_arguments_t *arguments) { assert(keyword->type == PM_TOKEN_KEYWORD_SUPER); - pm_super_node_t *node = PM_NODE_ALLOC(parser, pm_super_node_t); - - const uint8_t *end = pm_arguments_end(arguments); - if (end == NULL) { - assert(false && "unreachable"); - } - *node = (pm_super_node_t) { - { - .type = PM_SUPER_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = end, - } - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .lparen_loc = arguments->opening_loc, - .arguments = arguments->arguments, - .rparen_loc = arguments->closing_loc, - .block = arguments->block - }; - - return node; + const pm_location_t *end = pm_arguments_end(arguments); + assert(end != NULL && "unreachable"); + + return pm_super_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = PM_TOKEN_START(parser, keyword), .length = PM_LOCATION_END(end) - PM_TOKEN_START(parser, keyword) }), + TOK2LOC(parser, keyword), + arguments->opening_loc, + arguments->arguments, + arguments->closing_loc, + arguments->block + ); } /** @@ -7156,7 +6657,7 @@ parse_symbol_encoding_validate_utf8(pm_parser_t *parser, const pm_token_t *locat size_t width = pm_encoding_utf_8_char_width(cursor, end - cursor); if (width == 0) { - pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL); + pm_parser_err(parser, PM_TOKEN_START(parser, location), PM_TOKEN_LENGTH(location), PM_ERR_INVALID_SYMBOL); break; } @@ -7176,7 +6677,7 @@ parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *loca size_t width = encoding->char_width(cursor, end - cursor); if (width == 0) { - pm_parser_err(parser, location->start, location->end, PM_ERR_INVALID_SYMBOL); + pm_parser_err(parser, PM_TOKEN_START(parser, location), PM_TOKEN_LENGTH(location), PM_ERR_INVALID_SYMBOL); break; } @@ -7193,7 +6694,7 @@ parse_symbol_encoding_validate_other(pm_parser_t *parser, const pm_token_t *loca * If the validate flag is set, then it will check the contents of the symbol * to ensure that all characters are valid in the encoding. */ -static inline pm_node_flags_t +static PRISM_INLINE pm_node_flags_t parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_string_t *contents, bool validate) { if (parser->explicit_encoding != NULL) { // A Symbol may optionally have its encoding explicitly set. This will @@ -7218,160 +6719,31 @@ parse_symbol_encoding(pm_parser_t *parser, const pm_token_t *location, const pm_ return 0; } -static pm_node_flags_t -parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { - assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || - (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || - (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) || - (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY)); - - // There's special validation logic used if a string does not contain any character escape sequences. - if (parser->explicit_encoding == NULL) { - // If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp - // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to - // the US-ASCII encoding. - if (ascii_only) { - return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; - } - - if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { - if (!ascii_only) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); - } - } else if (parser->encoding != modifier_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); - - if (modifier == 'n' && !ascii_only) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } - - return flags; - } - - // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meanwhile. - bool mixed_encoding = false; - - if (mixed_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { - // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily. - bool valid_string_in_modifier_encoding = true; - - if (!valid_string_in_modifier_encoding) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now. - if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, (int) pm_string_length(source), (const char *) pm_string_source(source)); - } - } - - // We've determined the encoding would naturally be EUC-JP and there is no need to force the encoding to anything else. - return flags; -} - -/** - * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and - * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even - * when the Regexp is explicitly given an ASCII-8BIT encoding via the (/n) modifier. Otherwise, the encoding - * may be explicitly set with an escape sequence. - */ -static pm_node_flags_t -parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) { - // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. - bool valid_unicode_range = true; - if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, (int) pm_string_length(source), (const char *) pm_string_source(source)); - return flags; - } - - // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding - // to multi-byte characters are allowed. - if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { - // CRuby will continue processing even though a SyntaxError has already been detected. It may result in the - // following error message appearing twice. We do the same for compatibility. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); - } - - /** - * Start checking modifier flags. We need to process these before considering any explicit encodings that may have - * been set by character literals. The order in which the encoding modifiers is checked does not matter. In the - * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set - * the corresponding "forced_<encoding>" flag. Instead, the caller should check the encoding modifier flag and - * determine the encoding that way. - */ - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); - } - - if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { - return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); - } - - // At this point no encoding modifiers will be present on the regular expression as they would have already - // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all - // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII. - if (ascii_only) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; - } - - // A Regexp may optionally have its encoding explicitly set via a character escape sequence in the source string - // or by specifying a modifier. - // - // NB: an explicitly set encoding is ignored by Ruby if the Regexp consists of only US ASCII code points. - if (parser->explicit_encoding != NULL) { - if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; - } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { - return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; - } - } - - return 0; -} - /** * Allocate and initialize a new SymbolNode node with the given unescaped * string. */ static pm_symbol_node_t * pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) { - pm_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_symbol_node_t); - - *node = (pm_symbol_node_t) { - { - .type = PM_SYMBOL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL | flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start), - .end = (closing->type == PM_TOKEN_NOT_PROVIDED ? value->end : closing->end) - } - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .value_loc = PM_LOCATION_TOKEN_VALUE(value), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .unescaped = *unescaped - }; - - return node; + uint32_t start = opening == NULL ? PM_TOKEN_START(parser, value) : PM_TOKEN_START(parser, opening); + uint32_t end = closing == NULL ? PM_TOKEN_END(parser, value) : PM_TOKEN_END(parser, closing); + + return pm_symbol_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL | flags, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + NTOK2LOC(parser, opening), + NTOK2LOC(parser, value), + NTOK2LOC(parser, closing), + *unescaped + ); } /** * Allocate and initialize a new SymbolNode node. */ -static inline pm_symbol_node_t * +static PRISM_INLINE pm_symbol_node_t * pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0); } @@ -7391,35 +6763,15 @@ pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *open */ static pm_symbol_node_t * pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { - pm_symbol_node_t *node; - - switch (token->type) { - case PM_TOKEN_LABEL: { - pm_token_t opening = not_provided(parser); - pm_token_t closing = { .type = PM_TOKEN_LABEL_END, .start = token->end - 1, .end = token->end }; + assert(token->type == PM_TOKEN_LABEL); - pm_token_t label = { .type = PM_TOKEN_LABEL, .start = token->start, .end = token->end - 1 }; - node = pm_symbol_node_create(parser, &opening, &label, &closing); + pm_token_t closing = { .type = PM_TOKEN_LABEL_END, .start = token->end - 1, .end = token->end }; + pm_token_t label = { .type = PM_TOKEN_LABEL, .start = token->start, .end = token->end - 1 }; + pm_symbol_node_t *node = pm_symbol_node_create(parser, NULL, &label, &closing); - assert((label.end - label.start) >= 0); - pm_string_shared_init(&node->unescaped, label.start, label.end); - pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &label, &node->unescaped, false)); - - break; - } - case PM_TOKEN_MISSING: { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - pm_token_t label = { .type = PM_TOKEN_LABEL, .start = token->start, .end = token->end }; - node = pm_symbol_node_create(parser, &opening, &label, &closing); - break; - } - default: - assert(false && "unreachable"); - node = NULL; - break; - } + assert((label.end - label.start) >= 0); + pm_string_shared_init(&node->unescaped, label.start, label.end); + pm_node_flag_set(UP(node), parse_symbol_encoding(parser, &label, &node->unescaped, false)); return node; } @@ -7429,18 +6781,16 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_symbol_node_t * pm_symbol_node_synthesized_create(pm_parser_t *parser, const char *content) { - pm_symbol_node_t *node = PM_NODE_ALLOC(parser, pm_symbol_node_t); - - *node = (pm_symbol_node_t) { - { - .type = PM_SYMBOL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL | PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .value_loc = PM_LOCATION_NULL_VALUE(parser), - .unescaped = { 0 } - }; + pm_symbol_node_t *node = pm_symbol_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL | PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING, + PM_LOCATION_INIT_UNSET, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_string_t) { 0 }) + ); pm_string_constant_init(&node->unescaped, content, strlen(content)); return node; @@ -7450,21 +6800,29 @@ pm_symbol_node_synthesized_create(pm_parser_t *parser, const char *content) { * Check if the given node is a label in a hash. */ static bool -pm_symbol_node_label_p(pm_node_t *node) { - const uint8_t *end = NULL; +pm_symbol_node_label_p(const pm_parser_t *parser, const pm_node_t *node) { + const pm_location_t *location = NULL; switch (PM_NODE_TYPE(node)) { - case PM_SYMBOL_NODE: - end = ((pm_symbol_node_t *) node)->closing_loc.end; + case PM_SYMBOL_NODE: { + const pm_symbol_node_t *cast = (pm_symbol_node_t *) node; + if (cast->closing_loc.length > 0) { + location = &cast->closing_loc; + } break; - case PM_INTERPOLATED_SYMBOL_NODE: - end = ((pm_interpolated_symbol_node_t *) node)->closing_loc.end; + } + case PM_INTERPOLATED_SYMBOL_NODE: { + const pm_interpolated_symbol_node_t *cast = (pm_interpolated_symbol_node_t *) node; + if (cast->closing_loc.length > 0) { + location = &cast->closing_loc; + } break; + } default: return false; } - return (end != NULL) && (end[-1] == ':'); + return (location != NULL) && (parser->start[PM_LOCATION_END(location) - 1] == ':'); } /** @@ -7472,32 +6830,26 @@ pm_symbol_node_label_p(pm_node_t *node) { */ static pm_symbol_node_t * pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const pm_token_t *opening, const pm_token_t *closing) { - pm_symbol_node_t *new_node = PM_NODE_ALLOC(parser, pm_symbol_node_t); + pm_symbol_node_t *new_node = pm_symbol_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + TOK2LOC(parser, opening), + node->content_loc, + TOK2LOC(parser, closing), + node->unescaped + ); - *new_node = (pm_symbol_node_t) { - { - .type = PM_SYMBOL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - } - }, - .opening_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(opening), - .value_loc = node->content_loc, - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .unescaped = node->unescaped + pm_token_t content = { + .type = PM_TOKEN_IDENTIFIER, + .start = parser->start + node->content_loc.start, + .end = parser->start + node->content_loc.start + node->content_loc.length }; - pm_token_t content = { .type = PM_TOKEN_IDENTIFIER, .start = node->content_loc.start, .end = node->content_loc.end }; - pm_node_flag_set((pm_node_t *) new_node, parse_symbol_encoding(parser, &content, &node->unescaped, true)); - - // We are explicitly _not_ using pm_node_destroy here because we don't want - // to trash the unescaped string. We could instead copy the string if we - // know that it is owned, but we're taking the fast path for now. - xfree(node); + pm_node_flag_set(UP(new_node), parse_symbol_encoding(parser, &content, &node->unescaped, true)); + /* The old node is arena-allocated so no explicit free is needed. */ return new_node; } @@ -7506,7 +6858,6 @@ pm_string_node_to_symbol_node(pm_parser_t *parser, pm_string_node_t *node, const */ static pm_string_node_t * pm_symbol_node_to_string_node(pm_parser_t *parser, pm_symbol_node_t *node) { - pm_string_node_t *new_node = PM_NODE_ALLOC(parser, pm_string_node_t); pm_node_flags_t flags = 0; switch (parser->frozen_string_literal) { @@ -7518,24 +6869,18 @@ pm_symbol_node_to_string_node(pm_parser_t *parser, pm_symbol_node_t *node) { break; } - *new_node = (pm_string_node_t) { - { - .type = PM_STRING_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = node->base.location - }, - .opening_loc = node->opening_loc, - .content_loc = node->value_loc, - .closing_loc = node->closing_loc, - .unescaped = node->unescaped - }; - - // We are explicitly _not_ using pm_node_destroy here because we don't want - // to trash the unescaped string. We could instead copy the string if we - // know that it is owned, but we're taking the fast path for now. - xfree(node); + pm_string_node_t *new_node = pm_string_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_NODE(node), + node->opening_loc, + node->value_loc, + node->closing_loc, + node->unescaped + ); + /* The old node is arena-allocated so no explicit free is needed. */ return new_node; } @@ -7545,16 +6890,13 @@ pm_symbol_node_to_string_node(pm_parser_t *parser, pm_symbol_node_t *node) { static pm_true_node_t * pm_true_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_TRUE); - pm_true_node_t *node = PM_NODE_ALLOC(parser, pm_true_node_t); - - *node = (pm_true_node_t) {{ - .type = PM_TRUE_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token) - }}; - return node; + return pm_true_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_TOKEN(parser, token) + ); } /** @@ -7562,16 +6904,12 @@ pm_true_node_create(pm_parser_t *parser, const pm_token_t *token) { */ static pm_true_node_t * pm_true_node_synthesized_create(pm_parser_t *parser) { - pm_true_node_t *node = PM_NODE_ALLOC(parser, pm_true_node_t); - - *node = (pm_true_node_t) {{ - .type = PM_TRUE_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { .start = parser->start, .end = parser->end } - }}; - - return node; + return pm_true_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_STATIC_LITERAL, + PM_LOCATION_INIT_UNSET + ); } /** @@ -7580,28 +6918,24 @@ pm_true_node_synthesized_create(pm_parser_t *parser) { static pm_undef_node_t * pm_undef_node_create(pm_parser_t *parser, const pm_token_t *token) { assert(token->type == PM_TOKEN_KEYWORD_UNDEF); - pm_undef_node_t *node = PM_NODE_ALLOC(parser, pm_undef_node_t); - - *node = (pm_undef_node_t) { - { - .type = PM_UNDEF_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_TOKEN_VALUE(token), - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(token), - .names = { 0 } - }; - return node; + return pm_undef_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, token), + ((pm_node_list_t) { 0 }), + TOK2LOC(parser, token) + ); } /** * Append a name to an undef node. */ static void -pm_undef_node_append(pm_undef_node_t *node, pm_node_t *name) { - node->base.location.end = name->location.end; - pm_node_list_append(&node->names, name); +pm_undef_node_append(pm_arena_t *arena, pm_undef_node_t *node, pm_node_t *name) { + PM_NODE_LENGTH_SET_NODE(node, name); + pm_node_list_append(arena, &node->names, name); } /** @@ -7610,34 +6944,20 @@ pm_undef_node_append(pm_undef_node_t *node, pm_node_t *name) { static pm_unless_node_t * pm_unless_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, const pm_token_t *then_keyword, pm_statements_node_t *statements) { pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - pm_unless_node_t *node = PM_NODE_ALLOC(parser, pm_unless_node_t); - - const uint8_t *end; - if (statements != NULL) { - end = statements->base.location.end; - } else { - end = predicate->location.end; - } - - *node = (pm_unless_node_t) { - { - .type = PM_UNLESS_NODE, - .flags = PM_NODE_FLAG_NEWLINE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = end - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .predicate = predicate, - .then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(then_keyword), - .statements = statements, - .else_clause = NULL, - .end_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + pm_node_t *end = statements == NULL ? predicate : UP(statements); + + return pm_unless_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_NEWLINE, + PM_LOCATION_INIT_TOKEN_NODE(parser, keyword, end), + TOK2LOC(parser, keyword), + predicate, + NTOK2LOC(parser, then_keyword), + statements, + NULL, + ((pm_location_t) { 0 }) + ); } /** @@ -7646,36 +6966,28 @@ pm_unless_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t static pm_unless_node_t * pm_unless_node_modifier_create(pm_parser_t *parser, pm_node_t *statement, const pm_token_t *unless_keyword, pm_node_t *predicate) { pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - pm_unless_node_t *node = PM_NODE_ALLOC(parser, pm_unless_node_t); pm_statements_node_t *statements = pm_statements_node_create(parser); pm_statements_node_body_append(parser, statements, statement, true); - *node = (pm_unless_node_t) { - { - .type = PM_UNLESS_NODE, - .flags = PM_NODE_FLAG_NEWLINE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = statement->location.start, - .end = predicate->location.end - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(unless_keyword), - .predicate = predicate, - .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .statements = statements, - .else_clause = NULL, - .end_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE - }; - - return node; + return pm_unless_node_new( + parser->arena, + ++parser->node_id, + PM_NODE_FLAG_NEWLINE, + PM_LOCATION_INIT_NODES(statement, predicate), + TOK2LOC(parser, unless_keyword), + predicate, + ((pm_location_t) { 0 }), + statements, + NULL, + ((pm_location_t) { 0 }) + ); } -static inline void -pm_unless_node_end_keyword_loc_set(pm_unless_node_t *node, const pm_token_t *end_keyword) { - node->end_keyword_loc = PM_LOCATION_TOKEN_VALUE(end_keyword); - node->base.location.end = end_keyword->end; +static PRISM_INLINE void +pm_unless_node_end_keyword_loc_set(const pm_parser_t *parser, pm_unless_node_t *node, const pm_token_t *end_keyword) { + node->end_keyword_loc = TOK2LOC(parser, end_keyword); + PM_NODE_LENGTH_SET_TOKEN(parser, node, end_keyword); } /** @@ -7690,7 +7002,7 @@ pm_loop_modifier_block_exits(pm_parser_t *parser, pm_statements_node_t *statemen // All of the block exits that we want to remove should be within the // statements, and since we are modifying the statements, we shouldn't have // to check the end location. - const uint8_t *start = statements->base.location.start; + uint32_t start = statements->base.location.start; for (size_t index = parser->current_block_exits->size; index > 0; index--) { pm_node_t *block_exit = parser->current_block_exits->nodes[index - 1]; @@ -7706,27 +7018,19 @@ pm_loop_modifier_block_exits(pm_parser_t *parser, pm_statements_node_t *statemen */ static pm_until_node_t * pm_until_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *do_keyword, const pm_token_t *closing, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { - pm_until_node_t *node = PM_NODE_ALLOC(parser, pm_until_node_t); pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - *node = (pm_until_node_t) { - { - .type = PM_UNTIL_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = closing->end, - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .predicate = predicate, - .statements = statements - }; - - return node; + return pm_until_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_TOKENS(parser, keyword, closing), + TOK2LOC(parser, keyword), + NTOK2LOC(parser, do_keyword), + TOK2LOC(parser, closing), + predicate, + statements + ); } /** @@ -7734,28 +7038,20 @@ pm_until_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_to */ static pm_until_node_t * pm_until_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { - pm_until_node_t *node = PM_NODE_ALLOC(parser, pm_until_node_t); pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); pm_loop_modifier_block_exits(parser, statements); - *node = (pm_until_node_t) { - { - .type = PM_UNTIL_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = statements->base.location.start, - .end = predicate->location.end, - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .do_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .predicate = predicate, - .statements = statements - }; - - return node; + return pm_until_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_NODES(statements, predicate), + TOK2LOC(parser, keyword), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + predicate, + statements + ); } /** @@ -7763,42 +7059,34 @@ pm_until_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm */ static pm_when_node_t * pm_when_node_create(pm_parser_t *parser, const pm_token_t *keyword) { - pm_when_node_t *node = PM_NODE_ALLOC(parser, pm_when_node_t); - - *node = (pm_when_node_t) { - { - .type = PM_WHEN_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = NULL - } - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .statements = NULL, - .then_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .conditions = { 0 } - }; - - return node; + return pm_when_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_TOKEN(parser, keyword), + TOK2LOC(parser, keyword), + ((pm_node_list_t) { 0 }), + ((pm_location_t) { 0 }), + NULL + ); } /** * Append a new condition to a when node. */ static void -pm_when_node_conditions_append(pm_when_node_t *node, pm_node_t *condition) { - node->base.location.end = condition->location.end; - pm_node_list_append(&node->conditions, condition); +pm_when_node_conditions_append(pm_arena_t *arena, pm_when_node_t *node, pm_node_t *condition) { + PM_NODE_LENGTH_SET_NODE(node, condition); + pm_node_list_append(arena, &node->conditions, condition); } /** * Set the location of the then keyword of a when node. */ -static inline void -pm_when_node_then_keyword_loc_set(pm_when_node_t *node, const pm_token_t *then_keyword) { - node->base.location.end = then_keyword->end; - node->then_keyword_loc = PM_LOCATION_TOKEN_VALUE(then_keyword); +static PRISM_INLINE void +pm_when_node_then_keyword_loc_set(const pm_parser_t *parser, pm_when_node_t *node, const pm_token_t *then_keyword) { + PM_NODE_LENGTH_SET_TOKEN(parser, node, then_keyword); + node->then_keyword_loc = TOK2LOC(parser, then_keyword); } /** @@ -7806,8 +7094,8 @@ pm_when_node_then_keyword_loc_set(pm_when_node_t *node, const pm_token_t *then_k */ static void pm_when_node_statements_set(pm_when_node_t *node, pm_statements_node_t *statements) { - if (statements->base.location.end > node->base.location.end) { - node->base.location.end = statements->base.location.end; + if (PM_NODE_END(statements) > PM_NODE_END(node)) { + PM_NODE_LENGTH_SET_NODE(node, statements); } node->statements = statements; @@ -7818,27 +7106,19 @@ pm_when_node_statements_set(pm_when_node_t *node, pm_statements_node_t *statemen */ static pm_while_node_t * pm_while_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_token_t *do_keyword, const pm_token_t *closing, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { - pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); - *node = (pm_while_node_t) { - { - .type = PM_WHILE_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = closing->end - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .do_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(do_keyword), - .closing_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(closing), - .predicate = predicate, - .statements = statements - }; - - return node; + return pm_while_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_TOKENS(parser, keyword, closing), + TOK2LOC(parser, keyword), + NTOK2LOC(parser, do_keyword), + TOK2LOC(parser, closing), + predicate, + statements + ); } /** @@ -7846,28 +7126,20 @@ pm_while_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_to */ static pm_while_node_t * pm_while_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm_node_t *predicate, pm_statements_node_t *statements, pm_node_flags_t flags) { - pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); pm_conditional_predicate(parser, predicate, PM_CONDITIONAL_PREDICATE_TYPE_CONDITIONAL); pm_loop_modifier_block_exits(parser, statements); - *node = (pm_while_node_t) { - { - .type = PM_WHILE_NODE, - .flags = flags, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = statements->base.location.start, - .end = predicate->location.end - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .do_keyword_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .closing_loc = PM_OPTIONAL_LOCATION_NOT_PROVIDED_VALUE, - .predicate = predicate, - .statements = statements - }; - - return node; + return pm_while_node_new( + parser->arena, + ++parser->node_id, + flags, + PM_LOCATION_INIT_NODES(statements, predicate), + TOK2LOC(parser, keyword), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + predicate, + statements + ); } /** @@ -7875,22 +7147,17 @@ pm_while_node_modifier_create(pm_parser_t *parser, const pm_token_t *keyword, pm */ static pm_while_node_t * pm_while_node_synthesized_create(pm_parser_t *parser, pm_node_t *predicate, pm_statements_node_t *statements) { - pm_while_node_t *node = PM_NODE_ALLOC(parser, pm_while_node_t); - - *node = (pm_while_node_t) { - { - .type = PM_WHILE_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = PM_LOCATION_NULL_VALUE(parser) - }, - .keyword_loc = PM_LOCATION_NULL_VALUE(parser), - .do_keyword_loc = PM_LOCATION_NULL_VALUE(parser), - .closing_loc = PM_LOCATION_NULL_VALUE(parser), - .predicate = predicate, - .statements = statements - }; - - return node; + return pm_while_node_new( + parser->arena, + ++parser->node_id, + 0, + PM_LOCATION_INIT_UNSET, + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + ((pm_location_t) { 0 }), + predicate, + statements + ); } /** @@ -7899,31 +7166,22 @@ pm_while_node_synthesized_create(pm_parser_t *parser, pm_node_t *predicate, pm_s */ static pm_x_string_node_t * pm_xstring_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing, const pm_string_t *unescaped) { - pm_x_string_node_t *node = PM_NODE_ALLOC(parser, pm_x_string_node_t); - - *node = (pm_x_string_node_t) { - { - .type = PM_X_STRING_NODE, - .flags = PM_STRING_FLAGS_FROZEN, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = opening->start, - .end = closing->end - }, - }, - .opening_loc = PM_LOCATION_TOKEN_VALUE(opening), - .content_loc = PM_LOCATION_TOKEN_VALUE(content), - .closing_loc = PM_LOCATION_TOKEN_VALUE(closing), - .unescaped = *unescaped - }; - - return node; + return pm_x_string_node_new( + parser->arena, + ++parser->node_id, + PM_STRING_FLAGS_FROZEN, + PM_LOCATION_INIT_TOKENS(parser, opening, closing), + TOK2LOC(parser, opening), + TOK2LOC(parser, content), + TOK2LOC(parser, closing), + *unescaped + ); } /** * Allocate and initialize a new XStringNode node. */ -static inline pm_x_string_node_t * +static PRISM_INLINE pm_x_string_node_t * pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *content, const pm_token_t *closing) { return pm_xstring_node_create_unescaped(parser, opening, content, closing, &PM_STRING_EMPTY); } @@ -7933,40 +7191,31 @@ pm_xstring_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_ */ static pm_yield_node_t * pm_yield_node_create(pm_parser_t *parser, const pm_token_t *keyword, const pm_location_t *lparen_loc, pm_arguments_node_t *arguments, const pm_location_t *rparen_loc) { - pm_yield_node_t *node = PM_NODE_ALLOC(parser, pm_yield_node_t); + uint32_t start = PM_TOKEN_START(parser, keyword); + uint32_t end; - const uint8_t *end; - if (rparen_loc->start != NULL) { - end = rparen_loc->end; + if (rparen_loc->length > 0) { + end = PM_LOCATION_END(rparen_loc); } else if (arguments != NULL) { - end = arguments->base.location.end; - } else if (lparen_loc->start != NULL) { - end = lparen_loc->end; + end = PM_NODE_END(arguments); + } else if (lparen_loc->length > 0) { + end = PM_LOCATION_END(lparen_loc); } else { - end = keyword->end; - } - - *node = (pm_yield_node_t) { - { - .type = PM_YIELD_NODE, - .node_id = PM_NODE_IDENTIFY(parser), - .location = { - .start = keyword->start, - .end = end - }, - }, - .keyword_loc = PM_LOCATION_TOKEN_VALUE(keyword), - .lparen_loc = *lparen_loc, - .arguments = arguments, - .rparen_loc = *rparen_loc - }; - - return node; + end = PM_TOKEN_END(parser, keyword); + } + + return pm_yield_node_new( + parser->arena, + ++parser->node_id, + 0, + ((pm_location_t) { .start = start, .length = U32(end - start) }), + TOK2LOC(parser, keyword), + *lparen_loc, + arguments, + *rparen_loc + ); } -#undef PM_NODE_ALLOC -#undef PM_NODE_IDENTIFY - /** * Check if any of the currently visible scopes contain a local variable * described by the given constant id. @@ -7992,7 +7241,7 @@ pm_parser_local_depth_constant_id(pm_parser_t *parser, pm_constant_id_t constant * described by the given token. This function implicitly inserts a constant * into the constant pool. */ -static inline int +static PRISM_INLINE int pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) { return pm_parser_local_depth_constant_id(parser, pm_parser_constant_id_token(parser, token)); } @@ -8000,27 +7249,35 @@ pm_parser_local_depth(pm_parser_t *parser, pm_token_t *token) { /** * Add a constant id to the local table of the current scope. */ -static inline void +static PRISM_INLINE void pm_parser_local_add(pm_parser_t *parser, pm_constant_id_t constant_id, const uint8_t *start, const uint8_t *end, uint32_t reads) { - pm_locals_write(&parser->current_scope->locals, constant_id, start, end, reads); + pm_locals_write(&parser->current_scope->locals, constant_id, U32(start - parser->start), U32(end - start), reads); } /** * Add a local variable from a location to the current scope. */ static pm_constant_id_t -pm_parser_local_add_location(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, uint32_t reads) { - pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, start, end); +pm_parser_local_add_raw(pm_parser_t *parser, const uint8_t *start, const uint8_t *end, uint32_t reads) { + pm_constant_id_t constant_id = pm_parser_constant_id_raw(parser, start, end); if (constant_id != 0) pm_parser_local_add(parser, constant_id, start, end, reads); return constant_id; } /** + * Add a local variable from a location to the current scope. + */ +static PRISM_INLINE pm_constant_id_t +pm_parser_local_add_location(pm_parser_t *parser, pm_location_t *location, uint32_t reads) { + return pm_parser_local_add_raw(parser, parser->start + location->start, parser->start + location->start + location->length, reads); +} + +/** * Add a local variable from a token to the current scope. */ -static inline pm_constant_id_t +static PRISM_INLINE pm_constant_id_t pm_parser_local_add_token(pm_parser_t *parser, pm_token_t *token, uint32_t reads) { - return pm_parser_local_add_location(parser, token->start, token->end, reads); + return pm_parser_local_add_raw(parser, token->start, token->end, reads); } /** @@ -8054,7 +7311,7 @@ static bool pm_parser_parameter_name_check(pm_parser_t *parser, const pm_token_t *name) { // We want to check whether the parameter name is a numbered parameter or // not. - pm_refute_numbered_parameter(parser, name->start, name->end); + pm_refute_numbered_parameter(parser, PM_TOKEN_START(parser, name), PM_TOKEN_LENGTH(name)); // Otherwise we'll fetch the constant id for the parameter name and check // whether it's already in the current scope. @@ -8078,8 +7335,7 @@ pm_parser_scope_pop(pm_parser_t *parser) { pm_scope_t *scope = parser->current_scope; parser->current_scope = scope->previous; pm_locals_free(&scope->locals); - pm_node_list_free(&scope->implicit_parameters); - xfree(scope); + xfree_sized(scope, sizeof(pm_scope_t)); } /******************************************************************************/ @@ -8089,7 +7345,7 @@ pm_parser_scope_pop(pm_parser_t *parser) { /** * Pushes a value onto the stack. */ -static inline void +static PRISM_INLINE void pm_state_stack_push(pm_state_stack_t *stack, bool value) { *stack = (*stack << 1) | (value & 1); } @@ -8097,7 +7353,7 @@ pm_state_stack_push(pm_state_stack_t *stack, bool value) { /** * Pops a value off the stack. */ -static inline void +static PRISM_INLINE void pm_state_stack_pop(pm_state_stack_t *stack) { *stack >>= 1; } @@ -8105,38 +7361,38 @@ pm_state_stack_pop(pm_state_stack_t *stack) { /** * Returns the value at the top of the stack. */ -static inline bool +static PRISM_INLINE bool pm_state_stack_p(const pm_state_stack_t *stack) { return *stack & 1; } -static inline void +static PRISM_INLINE void pm_accepts_block_stack_push(pm_parser_t *parser, bool value) { // Use the negation of the value to prevent stack overflow. pm_state_stack_push(&parser->accepts_block_stack, !value); } -static inline void +static PRISM_INLINE void pm_accepts_block_stack_pop(pm_parser_t *parser) { pm_state_stack_pop(&parser->accepts_block_stack); } -static inline bool +static PRISM_INLINE bool pm_accepts_block_stack_p(pm_parser_t *parser) { return !pm_state_stack_p(&parser->accepts_block_stack); } -static inline void +static PRISM_INLINE void pm_do_loop_stack_push(pm_parser_t *parser, bool value) { pm_state_stack_push(&parser->do_loop_stack, value); } -static inline void +static PRISM_INLINE void pm_do_loop_stack_pop(pm_parser_t *parser) { pm_state_stack_pop(&parser->do_loop_stack); } -static inline bool +static PRISM_INLINE bool pm_do_loop_stack_p(pm_parser_t *parser) { return pm_state_stack_p(&parser->do_loop_stack); } @@ -8149,7 +7405,7 @@ pm_do_loop_stack_p(pm_parser_t *parser) { * Get the next character in the source starting from +cursor+. If that position * is beyond the end of the source then return '\0'. */ -static inline uint8_t +static PRISM_INLINE uint8_t peek_at(const pm_parser_t *parser, const uint8_t *cursor) { if (cursor < parser->end) { return *cursor; @@ -8163,7 +7419,7 @@ peek_at(const pm_parser_t *parser, const uint8_t *cursor) { * adding the given offset. If that position is beyond the end of the source * then return '\0'. */ -static inline uint8_t +static PRISM_INLINE uint8_t peek_offset(pm_parser_t *parser, ptrdiff_t offset) { return peek_at(parser, parser->current.end + offset); } @@ -8172,7 +7428,7 @@ peek_offset(pm_parser_t *parser, ptrdiff_t offset) { * Get the next character in the source starting from parser->current.end. If * that position is beyond the end of the source then return '\0'. */ -static inline uint8_t +static PRISM_INLINE uint8_t peek(const pm_parser_t *parser) { return peek_at(parser, parser->current.end); } @@ -8181,7 +7437,7 @@ peek(const pm_parser_t *parser) { * If the character to be read matches the given value, then returns true and * advances the current pointer. */ -static inline bool +static PRISM_INLINE bool match(pm_parser_t *parser, uint8_t value) { if (peek(parser) == value) { parser->current.end++; @@ -8194,7 +7450,7 @@ match(pm_parser_t *parser, uint8_t value) { * Return the length of the line ending string starting at +cursor+, or 0 if it * is not a line ending. This function is intended to be CRLF/LF agnostic. */ -static inline size_t +static PRISM_INLINE size_t match_eol_at(pm_parser_t *parser, const uint8_t *cursor) { if (peek_at(parser, cursor) == '\n') { return 1; @@ -8210,7 +7466,7 @@ match_eol_at(pm_parser_t *parser, const uint8_t *cursor) { * `parser->current.end + offset`, or 0 if it is not a line ending. This * function is intended to be CRLF/LF agnostic. */ -static inline size_t +static PRISM_INLINE size_t match_eol_offset(pm_parser_t *parser, ptrdiff_t offset) { return match_eol_at(parser, parser->current.end + offset); } @@ -8220,7 +7476,7 @@ match_eol_offset(pm_parser_t *parser, ptrdiff_t offset) { * or 0 if it is not a line ending. This function is intended to be CRLF/LF * agnostic. */ -static inline size_t +static PRISM_INLINE size_t match_eol(pm_parser_t *parser) { return match_eol_at(parser, parser->current.end); } @@ -8228,7 +7484,7 @@ match_eol(pm_parser_t *parser) { /** * Skip to the next newline character or NUL byte. */ -static inline const uint8_t * +static PRISM_INLINE const uint8_t * next_newline(const uint8_t *cursor, ptrdiff_t length) { assert(length >= 0); @@ -8241,7 +7497,7 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) { /** * This is equivalent to the predicate of warn_balanced in CRuby. */ -static inline bool +static PRISM_INLINE bool ambiguous_operator_p(const pm_parser_t *parser, bool space_seen) { return !lex_state_p(parser, PM_LEX_STATE_CLASS | PM_LEX_STATE_DOT | PM_LEX_STATE_FNAME | PM_LEX_STATE_ENDFN) && space_seen && !pm_char_is_whitespace(peek(parser)); } @@ -8319,7 +7575,7 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser) { // issue because we didn't understand the encoding that the user was // trying to use. In this case we'll keep using the default encoding but // add an error to the parser to indicate an unsuccessful parse. - pm_parser_err(parser, value_start, cursor, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT); + pm_parser_err(parser, U32(value_start - parser->start), U32(cursor - value_start), PM_ERR_INVALID_ENCODING_MAGIC_COMMENT); } } @@ -8344,7 +7600,7 @@ parser_lex_magic_comment_boolean_value(const uint8_t *value_start, uint32_t valu } } -static inline bool +static PRISM_INLINE bool pm_char_is_magic_comment_key_delimiter(const uint8_t b) { return b == '\'' || b == '"' || b == ':' || b == ';'; } @@ -8354,13 +7610,15 @@ pm_char_is_magic_comment_key_delimiter(const uint8_t b) { * found, it returns a pointer to the start of the marker. Otherwise it returns * NULL. */ -static inline const uint8_t * +static PRISM_INLINE const uint8_t * parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor, const uint8_t *end) { - while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor, '-', (size_t) (end - cursor), parser->encoding_changed, parser->encoding)) != NULL) { - if (cursor + 3 <= end && cursor[1] == '*' && cursor[2] == '-') { - return cursor; + // Scan for '*' as the middle character, since it is rarer than '-' in + // typical comments and avoids repeated memchr calls for '-' that hit + // dashes in words like "foo-bar". + while ((cursor + 3 <= end) && (cursor = pm_memchr(cursor + 1, '*', (size_t) (end - cursor - 1), parser->encoding_changed, parser->encoding)) != NULL) { + if (cursor[-1] == '-' && cursor + 1 < end && cursor[1] == '-') { + return cursor - 1; } - cursor++; } return NULL; } @@ -8375,7 +7633,7 @@ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor * It returns true if it consumes the entire comment. Otherwise it returns * false. */ -static inline bool +static PRISM_INLINE bool parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { bool result = true; @@ -8397,11 +7655,24 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { // have a magic comment. return false; } + } else { + // Non-emacs magic comments must contain a colon for `key: value`. + // Reject early if there is no colon to avoid scanning the entire + // comment character-by-character. + if (pm_memchr(start, ':', (size_t) (end - start), parser->encoding_changed, parser->encoding) == NULL) { + return false; + } + + // Advance start past leading whitespace so the main loop begins + // directly at the key, avoiding a redundant whitespace scan. + start += pm_strspn_whitespace(start, end - start); } cursor = start; while (cursor < end) { - while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++; + if (indicator) { + while (cursor < end && (pm_char_is_magic_comment_key_delimiter(*cursor) || pm_char_is_whitespace(*cursor))) cursor++; + } const uint8_t *key_start = cursor; while (cursor < end && (!pm_char_is_magic_comment_key_delimiter(*cursor) && !pm_char_is_whitespace(*cursor))) cursor++; @@ -8429,7 +7700,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { if (*cursor == '\\' && (cursor + 1 < end)) cursor++; } value_end = cursor; - if (*cursor == '"') cursor++; + if (cursor < end && *cursor == '"') cursor++; } else { value_start = cursor; while (cursor < end && *cursor != '"' && *cursor != ';' && !pm_char_is_whitespace(*cursor)) cursor++; @@ -8487,7 +7758,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { case PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID: PM_PARSER_WARN_TOKEN_FORMAT( parser, - parser->current, + &parser->current, PM_WARN_INVALID_MAGIC_COMMENT_VALUE, (int) key_length, (const char *) key_source, @@ -8514,7 +7785,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { case PM_MAGIC_COMMENT_BOOLEAN_VALUE_INVALID: PM_PARSER_WARN_TOKEN_FORMAT( parser, - parser->current, + &parser->current, PM_WARN_INVALID_MAGIC_COMMENT_VALUE, (int) key_length, (const char *) key_source, @@ -8549,7 +7820,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { } else { PM_PARSER_WARN_TOKEN_FORMAT( parser, - parser->current, + &parser->current, PM_WARN_INVALID_MAGIC_COMMENT_VALUE, (int) key_length, (const char *) key_source, @@ -8562,17 +7833,14 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { // When we're done, we want to free the string in case we had to // allocate memory for it. - pm_string_free(&key); + pm_string_cleanup(&key); // Allocate a new magic comment node to append to the parser's list. - pm_magic_comment_t *magic_comment; - if ((magic_comment = (pm_magic_comment_t *) xcalloc(1, sizeof(pm_magic_comment_t))) != NULL) { - magic_comment->key_start = key_start; - magic_comment->value_start = value_start; - magic_comment->key_length = (uint32_t) key_length; - magic_comment->value_length = value_length; - pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment); - } + pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) pm_arena_alloc(&parser->metadata_arena, sizeof(pm_magic_comment_t), PRISM_ALIGNOF(pm_magic_comment_t)); + magic_comment->node.next = NULL; + magic_comment->key = (pm_location_t) { .start = U32(key_start - parser->start), .length = U32(key_length) }; + magic_comment->value = (pm_location_t) { .start = U32(value_start - parser->start), .length = value_length }; + pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment); } return result; @@ -8582,85 +7850,67 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { /* Context manipulations */ /******************************************************************************/ -static bool -context_terminator(pm_context_t context, pm_token_t *token) { - switch (context) { - case PM_CONTEXT_MAIN: - case PM_CONTEXT_DEF_PARAMS: - case PM_CONTEXT_DEFINED: - case PM_CONTEXT_MULTI_TARGET: - case PM_CONTEXT_TERNARY: - case PM_CONTEXT_RESCUE_MODIFIER: - return token->type == PM_TOKEN_EOF; - case PM_CONTEXT_DEFAULT_PARAMS: - return token->type == PM_TOKEN_COMMA || token->type == PM_TOKEN_PARENTHESIS_RIGHT; - case PM_CONTEXT_PREEXE: - case PM_CONTEXT_POSTEXE: - return token->type == PM_TOKEN_BRACE_RIGHT; - case PM_CONTEXT_MODULE: - case PM_CONTEXT_CLASS: - case PM_CONTEXT_SCLASS: - case PM_CONTEXT_LAMBDA_DO_END: - case PM_CONTEXT_DEF: - case PM_CONTEXT_BLOCK_KEYWORDS: - return token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ENSURE; - case PM_CONTEXT_WHILE: - case PM_CONTEXT_UNTIL: - case PM_CONTEXT_ELSE: - case PM_CONTEXT_FOR: - case PM_CONTEXT_BEGIN_ENSURE: - case PM_CONTEXT_BLOCK_ENSURE: - case PM_CONTEXT_CLASS_ENSURE: - case PM_CONTEXT_DEF_ENSURE: - case PM_CONTEXT_LAMBDA_ENSURE: - case PM_CONTEXT_MODULE_ENSURE: - case PM_CONTEXT_SCLASS_ENSURE: - return token->type == PM_TOKEN_KEYWORD_END; - case PM_CONTEXT_LOOP_PREDICATE: - return token->type == PM_TOKEN_KEYWORD_DO || token->type == PM_TOKEN_KEYWORD_THEN; - case PM_CONTEXT_FOR_INDEX: - return token->type == PM_TOKEN_KEYWORD_IN; - case PM_CONTEXT_CASE_WHEN: - return token->type == PM_TOKEN_KEYWORD_WHEN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE; - case PM_CONTEXT_CASE_IN: - return token->type == PM_TOKEN_KEYWORD_IN || token->type == PM_TOKEN_KEYWORD_END || token->type == PM_TOKEN_KEYWORD_ELSE; - case PM_CONTEXT_IF: - case PM_CONTEXT_ELSIF: - return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_ELSIF || token->type == PM_TOKEN_KEYWORD_END; - case PM_CONTEXT_UNLESS: - return token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END; - case PM_CONTEXT_EMBEXPR: - return token->type == PM_TOKEN_EMBEXPR_END; - case PM_CONTEXT_BLOCK_BRACES: - return token->type == PM_TOKEN_BRACE_RIGHT; - case PM_CONTEXT_PARENS: - return token->type == PM_TOKEN_PARENTHESIS_RIGHT; - case PM_CONTEXT_BEGIN: - case PM_CONTEXT_BEGIN_RESCUE: - case PM_CONTEXT_BLOCK_RESCUE: - case PM_CONTEXT_CLASS_RESCUE: - case PM_CONTEXT_DEF_RESCUE: - case PM_CONTEXT_LAMBDA_RESCUE: - case PM_CONTEXT_MODULE_RESCUE: - case PM_CONTEXT_SCLASS_RESCUE: - return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_RESCUE || token->type == PM_TOKEN_KEYWORD_ELSE || token->type == PM_TOKEN_KEYWORD_END; - case PM_CONTEXT_BEGIN_ELSE: - case PM_CONTEXT_BLOCK_ELSE: - case PM_CONTEXT_CLASS_ELSE: - case PM_CONTEXT_DEF_ELSE: - case PM_CONTEXT_LAMBDA_ELSE: - case PM_CONTEXT_MODULE_ELSE: - case PM_CONTEXT_SCLASS_ELSE: - return token->type == PM_TOKEN_KEYWORD_ENSURE || token->type == PM_TOKEN_KEYWORD_END; - case PM_CONTEXT_LAMBDA_BRACES: - return token->type == PM_TOKEN_BRACE_RIGHT; - case PM_CONTEXT_PREDICATE: - return token->type == PM_TOKEN_KEYWORD_THEN || token->type == PM_TOKEN_NEWLINE || token->type == PM_TOKEN_SEMICOLON; - case PM_CONTEXT_NONE: - return false; - } +static const uint32_t context_terminators[] = { + [PM_CONTEXT_NONE] = 0, + [PM_CONTEXT_BEGIN] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BEGIN_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_BLOCK_KEYWORDS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_BLOCK_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_BLOCK_PARAMETERS] = (1U << PM_TOKEN_PIPE), + [PM_CONTEXT_BLOCK_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CASE_WHEN] = (1U << PM_TOKEN_KEYWORD_WHEN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE), + [PM_CONTEXT_CASE_IN] = (1U << PM_TOKEN_KEYWORD_IN) | (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_ELSE), + [PM_CONTEXT_CLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_CLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_CLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_DEF_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_DEF_PARAMS] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_DEFINED] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_DEFAULT_PARAMS] = (1U << PM_TOKEN_COMMA) | (1U << PM_TOKEN_PARENTHESIS_RIGHT), + [PM_CONTEXT_ELSE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_ELSIF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_EMBEXPR] = (1U << PM_TOKEN_EMBEXPR_END), + [PM_CONTEXT_FOR] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_FOR_INDEX] = (1U << PM_TOKEN_KEYWORD_IN), + [PM_CONTEXT_IF] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_ELSIF) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_BRACES] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_LAMBDA_DO_END] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_LAMBDA_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LAMBDA_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_LOOP_PREDICATE] = (1U << PM_TOKEN_KEYWORD_DO) | (1U << PM_TOKEN_KEYWORD_THEN), + [PM_CONTEXT_MAIN] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_MODULE] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_MODULE_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MODULE_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MODULE_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_MULTI_TARGET] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_PARENS] = (1U << PM_TOKEN_PARENTHESIS_RIGHT), + [PM_CONTEXT_POSTEXE] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_PREDICATE] = (1U << PM_TOKEN_KEYWORD_THEN) | (1U << PM_TOKEN_NEWLINE) | (1U << PM_TOKEN_SEMICOLON), + [PM_CONTEXT_PREEXE] = (1U << PM_TOKEN_BRACE_RIGHT), + [PM_CONTEXT_RESCUE_MODIFIER] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_SCLASS] = (1U << PM_TOKEN_KEYWORD_END) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ENSURE), + [PM_CONTEXT_SCLASS_ENSURE] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_SCLASS_ELSE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_SCLASS_RESCUE] = (1U << PM_TOKEN_KEYWORD_ENSURE) | (1U << PM_TOKEN_KEYWORD_RESCUE) | (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_TERNARY] = (1U << PM_TOKEN_EOF), + [PM_CONTEXT_UNLESS] = (1U << PM_TOKEN_KEYWORD_ELSE) | (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_UNTIL] = (1U << PM_TOKEN_KEYWORD_END), + [PM_CONTEXT_WHILE] = (1U << PM_TOKEN_KEYWORD_END), +}; - return false; +static PRISM_INLINE bool +context_terminator(pm_context_t context, pm_token_t *token) { + return token->type < 32 && (context_terminators[context] & (1U << token->type)); } /** @@ -8699,7 +7949,7 @@ context_push(pm_parser_t *parser, pm_context_t context) { static void context_pop(pm_parser_t *parser) { pm_context_node_t *prev = parser->current_context->prev; - xfree(parser->current_context); + xfree_sized(parser->current_context, sizeof(pm_context_node_t)); parser->current_context = prev; } @@ -8761,6 +8011,7 @@ context_human(pm_context_t context) { case PM_CONTEXT_BEGIN: return "begin statement"; case PM_CONTEXT_BLOCK_BRACES: return "'{'..'}' block"; case PM_CONTEXT_BLOCK_KEYWORDS: return "'do'..'end' block"; + case PM_CONTEXT_BLOCK_PARAMETERS: return "'|'..'|' block parameter"; case PM_CONTEXT_CASE_WHEN: return "'when' clause"; case PM_CONTEXT_CASE_IN: return "'in' clause"; case PM_CONTEXT_CLASS: return "class definition"; @@ -8821,11 +8072,11 @@ context_human(pm_context_t context) { /* Specific token lexers */ /******************************************************************************/ -static inline void +static PRISM_INLINE void pm_strspn_number_validate(pm_parser_t *parser, const uint8_t *string, size_t length, const uint8_t *invalid) { if (invalid != NULL) { pm_diagnostic_id_t diag_id = (invalid == (string + length - 1)) ? PM_ERR_INVALID_NUMBER_UNDERSCORE_TRAILING : PM_ERR_INVALID_NUMBER_UNDERSCORE_INNER; - pm_parser_err(parser, invalid, invalid + 1, diag_id); + pm_parser_err(parser, U32(invalid - parser->start), 1, diag_id); } } @@ -8936,7 +8187,7 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_BINARY); } - parser->integer_base = PM_INTEGER_BASE_FLAGS_BINARY; + parser->integer.base = PM_INTEGER_BASE_FLAGS_BINARY; break; // 0o1111 is an octal number @@ -8950,7 +8201,7 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_OCTAL); } - parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL; + parser->integer.base = PM_INTEGER_BASE_FLAGS_OCTAL; break; // 01111 is an octal number @@ -8964,7 +8215,7 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { case '6': case '7': parser->current.end += pm_strspn_octal_number_validate(parser, parser->current.end); - parser->integer_base = PM_INTEGER_BASE_FLAGS_OCTAL; + parser->integer.base = PM_INTEGER_BASE_FLAGS_OCTAL; break; // 0x1111 is a hexadecimal number @@ -8978,7 +8229,7 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { pm_parser_err_current(parser, PM_ERR_INVALID_NUMBER_HEXADECIMAL); } - parser->integer_base = PM_INTEGER_BASE_FLAGS_HEXADECIMAL; + parser->integer.base = PM_INTEGER_BASE_FLAGS_HEXADECIMAL; break; // 0.xxx is a float @@ -8996,11 +8247,62 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { } } else { // If it didn't start with a 0, then we'll lex as far as we can into a - // decimal number. - parser->current.end += pm_strspn_decimal_number_validate(parser, parser->current.end); + // decimal number. We compute the integer value inline to avoid + // re-scanning the digits later in pm_integer_parse. + { + const uint8_t *cursor = parser->current.end; + const uint8_t *end = parser->end; + uint64_t value = (uint64_t) (cursor[-1] - '0'); + + bool has_underscore = false; + bool prev_underscore = false; + const uint8_t *invalid = NULL; + + while (cursor < end) { + uint8_t c = *cursor; + if (c >= '0' && c <= '9') { + if (value <= UINT32_MAX) value = value * 10 + (uint64_t) (c - '0'); + prev_underscore = false; + cursor++; + } else if (c == '_') { + has_underscore = true; + if (prev_underscore && invalid == NULL) invalid = cursor; + prev_underscore = true; + cursor++; + } else { + break; + } + } + + if (has_underscore) { + if (prev_underscore && invalid == NULL) invalid = cursor - 1; + pm_strspn_number_validate(parser, parser->current.end, (size_t) (cursor - parser->current.end), invalid); + } + + if (value <= UINT32_MAX) { + parser->integer.value = (uint32_t) value; + parser->integer.lexed = true; + } + + parser->current.end = cursor; + } // Afterward, we'll lex as far as we can into an optional float suffix. - type = lex_optional_float_suffix(parser, seen_e); + // Guard the function call: the vast majority of decimal numbers are + // plain integers, so avoid the call when the next byte cannot start a + // float suffix. + { + uint8_t next = peek(parser); + if (next == '.' || next == 'e' || next == 'E') { + type = lex_optional_float_suffix(parser, seen_e); + + // If it turned out to be a float, the cached integer value is + // invalid. + if (type != PM_TOKEN_INTEGER) { + parser->integer.lexed = false; + } + } + } } // At this point we have a completed number, but we want to provide the user @@ -9010,7 +8312,7 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { const uint8_t *fraction_start = parser->current.end; const uint8_t *fraction_end = parser->current.end + 2; fraction_end += pm_strspn_decimal_digit(fraction_end, parser->end - fraction_end); - pm_parser_err(parser, fraction_start, fraction_end, PM_ERR_INVALID_NUMBER_FRACTION); + pm_parser_err(parser, U32(fraction_start - parser->start), U32(fraction_end - fraction_start), PM_ERR_INVALID_NUMBER_FRACTION); } return type; @@ -9019,7 +8321,8 @@ lex_numeric_prefix(pm_parser_t *parser, bool* seen_e) { static pm_token_type_t lex_numeric(pm_parser_t *parser) { pm_token_type_t type = PM_TOKEN_INTEGER; - parser->integer_base = PM_INTEGER_BASE_FLAGS_DECIMAL; + parser->integer.base = PM_INTEGER_BASE_FLAGS_DECIMAL; + parser->integer.lexed = false; if (parser->current.end < parser->end) { bool seen_e = false; @@ -9109,8 +8412,8 @@ lex_global_variable(pm_parser_t *parser) { } while ((width = char_is_identifier(parser, parser->current.end, parser->end - parser->current.end)) > 0); // $0 isn't allowed to be followed by anything. - pm_diagnostic_id_t diag_id = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->current, diag_id); + pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &parser->current, diag_id); } return PM_TOKEN_GLOBAL_VARIABLE; @@ -9146,9 +8449,9 @@ lex_global_variable(pm_parser_t *parser) { } else { // If we get here, then we have a $ followed by something that // isn't recognized as a global variable. - pm_diagnostic_id_t diag_id = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; - const uint8_t *end = parser->current.end + parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - PM_PARSER_ERR_FORMAT(parser, parser->current.start, end, diag_id, (int) (end - parser->current.start), (const char *) parser->current.start); + pm_diagnostic_id_t diag_id = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? PM_ERR_INVALID_VARIABLE_GLOBAL_3_3 : PM_ERR_INVALID_VARIABLE_GLOBAL; + size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); + PM_PARSER_ERR_FORMAT(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), diag_id, (int) (PM_TOKEN_LENGTH(&parser->current) + U32(width)), (const char *) parser->current.start); } return PM_TOKEN_GLOBAL_VARIABLE; @@ -9168,7 +8471,7 @@ lex_global_variable(pm_parser_t *parser) { * * `type` - the expected token type * * `modifier_type` - the expected modifier token type */ -static inline pm_token_type_t +static PRISM_INLINE pm_token_type_t lex_keyword(pm_parser_t *parser, const uint8_t *current_start, const char *value, size_t vlen, pm_lex_state_t state, pm_token_type_t type, pm_token_type_t modifier_type) { if (memcmp(current_start, value, vlen) == 0) { pm_lex_state_t last_state = parser->lex_state; @@ -9207,6 +8510,10 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) { current_end += width; } } else { + // Fast path: scan ASCII identifier bytes using wide operations. + current_end += scan_identifier_ascii(current_end, end); + + // Byte-at-a-time fallback for the tail and any UTF-8 sequences. while ((width = char_is_identifier_utf8(current_end, end - current_end)) > 0) { current_end += width; } @@ -9266,9 +8573,15 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) { switch (width) { case 2: if (lex_keyword(parser, current_start, "do", width, PM_LEX_STATE_BEG, PM_TOKEN_KEYWORD_DO, PM_TOKEN_EOF) != PM_TOKEN_EOF) { + if (parser->enclosure_nesting == parser->lambda_enclosure_nesting) { + return PM_TOKEN_KEYWORD_DO; + } if (pm_do_loop_stack_p(parser)) { return PM_TOKEN_KEYWORD_DO_LOOP; } + if (!pm_accepts_block_stack_p(parser)) { + return PM_TOKEN_KEYWORD_DO_BLOCK; + } return PM_TOKEN_KEYWORD_DO; } @@ -9347,8 +8660,8 @@ current_token_starts_line(pm_parser_t *parser) { * handle interpolation. This function performs that check. It returns a token * type representing what it found. Those cases are: * - * * PM_TOKEN_NOT_PROVIDED - No interpolation was found at this point. The - * caller should keep lexing. + * * 0 - No interpolation was found at this point. The caller should keep + * lexing. * * PM_TOKEN_STRING_CONTENT - No interpolation was found at this point. The * caller should return this token type. * * PM_TOKEN_EMBEXPR_BEGIN - An embedded expression was found. The caller @@ -9365,9 +8678,9 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) { return PM_TOKEN_STRING_CONTENT; } - // Now we'll check against the character that follows the #. If it constitutes - // valid interplation, we'll handle that, otherwise we'll return - // PM_TOKEN_NOT_PROVIDED. + // Now we'll check against the character that follows the #. If it + // constitutes valid interplation, we'll handle that, otherwise we'll return + // 0. switch (pound[1]) { case '@': { // In this case we may have hit an embedded instance or class variable. @@ -9401,7 +8714,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) { // string content. This is like if we get "#@-". In this case the caller // should keep lexing. parser->current.end = pound + 1; - return PM_TOKEN_NOT_PROVIDED; + return 0; } case '$': // In this case we may have hit an embedded global variable. If there's @@ -9451,7 +8764,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) { // In this case we've hit a #$ that does not indicate a global variable. // In this case we'll continue lexing past it. parser->current.end = pound + 1; - return PM_TOKEN_NOT_PROVIDED; + return 0; case '{': // In this case it's the start of an embedded expression. If we have // already consumed content, then we need to return that content as string @@ -9475,7 +8788,7 @@ lex_interpolation(pm_parser_t *parser, const uint8_t *pound) { // mark that by returning the not provided token type. This tells the // consumer to keep lexing forward. parser->current.end = pound + 1; - return PM_TOKEN_NOT_PROVIDED; + return 0; } } @@ -9499,7 +8812,7 @@ static const bool ascii_printable_chars[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; -static inline bool +static PRISM_INLINE bool char_is_ascii_printable(const uint8_t b) { return (b < 0x80) && ascii_printable_chars[b]; } @@ -9508,7 +8821,7 @@ char_is_ascii_printable(const uint8_t b) { * Return the value that a hexadecimal digit character represents. For example, * transform 'a' into 10, 'b' into 11, etc. */ -static inline uint8_t +static PRISM_INLINE uint8_t escape_hexadecimal_digit(const uint8_t value) { return (uint8_t) ((value <= '9') ? (value - '0') : (value & 0x7) + 9); } @@ -9518,8 +8831,8 @@ escape_hexadecimal_digit(const uint8_t value) { * digits scanned. This function assumes that the characters have already been * validated. */ -static inline uint32_t -escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length) { +static PRISM_INLINE uint32_t +escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length, const pm_location_t *error_location, const uint8_t flags) { uint32_t value = 0; for (size_t index = 0; index < length; index++) { if (index != 0) value <<= 4; @@ -9529,7 +8842,14 @@ escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length) { // Here we're going to verify that the value is actually a valid Unicode // codepoint and not a surrogate pair. if (value >= 0xD800 && value <= 0xDFFF) { - pm_parser_err(parser, string, string + length, PM_ERR_ESCAPE_INVALID_UNICODE); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, defer the error to regexp encoding + // validation where we can produce a regexp-specific message. + } else if (error_location != NULL) { + pm_parser_err(parser, error_location->start, error_location->length, PM_ERR_ESCAPE_INVALID_UNICODE); + } else { + pm_parser_err(parser, U32(string - parser->start), U32(length), PM_ERR_ESCAPE_INVALID_UNICODE); + } return 0xFFFD; } @@ -9539,7 +8859,7 @@ escape_unicode(pm_parser_t *parser, const uint8_t *string, size_t length) { /** * Escape a single character value based on the given flags. */ -static inline uint8_t +static PRISM_INLINE uint8_t escape_byte(uint8_t value, const uint8_t flags) { if (flags & PM_ESCAPE_FLAG_CONTROL) value &= 0x9f; if (flags & PM_ESCAPE_FLAG_META) value |= 0x80; @@ -9549,21 +8869,32 @@ escape_byte(uint8_t value, const uint8_t flags) { /** * Write a unicode codepoint to the given buffer. */ -static inline void +static PRISM_INLINE void escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, const uint8_t *start, const uint8_t *end, uint32_t value) { // \u escape sequences in string-like structures implicitly change the // encoding to UTF-8 if they are >= 0x80 or if they are used in a character // literal. if (value >= 0x80 || flags & PM_ESCAPE_FLAG_SINGLE) { if (parser->explicit_encoding != NULL && parser->explicit_encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_FORMAT(parser, start, end, PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, suppress this error — the regexp encoding + // validation will produce a more specific error message. + } else { + PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(end - start), PM_ERR_MIXED_ENCODING, parser->explicit_encoding->name); + } } parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; } if (!pm_buffer_append_unicode_codepoint(buffer, value)) { - pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, defer the error to the regexp encoding + // validation which produces a regexp-specific message. + } else { + pm_parser_err(parser, U32(start - parser->start), U32(end - start), PM_ERR_ESCAPE_INVALID_UNICODE); + } + pm_buffer_append_byte(buffer, 0xEF); pm_buffer_append_byte(buffer, 0xBF); pm_buffer_append_byte(buffer, 0xBD); @@ -9574,11 +8905,16 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla * When you're writing a byte to the unescape buffer, if the byte is non-ASCII * (i.e., the top bit is set) then it locks in the encoding. */ -static inline void -escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte) { +static PRISM_INLINE void +escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t flags, uint8_t byte) { if (byte >= 0x80) { if (parser->explicit_encoding != NULL && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // In regexp context, suppress this error — the regexp encoding + // validation will produce a more specific error message. + } else { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } } parser->explicit_encoding = parser->encoding; @@ -9602,19 +8938,19 @@ escape_write_byte_encoded(pm_parser_t *parser, pm_buffer_t *buffer, uint8_t byte * Note that in this case there is a literal \ byte in the regular expression * source so that the regular expression engine will perform its own unescaping. */ -static inline void +static PRISM_INLINE void escape_write_byte(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags, uint8_t byte) { if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_format(regular_expression_buffer, "\\x%02X", byte); } - escape_write_byte_encoded(parser, buffer, byte); + escape_write_byte_encoded(parser, buffer, flags, byte); } /** * Write each byte of the given escaped character into the buffer. */ -static inline void +static PRISM_INLINE void escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expression_buffer, uint8_t flags) { size_t width; if (parser->encoding_changed) { @@ -9624,6 +8960,7 @@ escape_write_escape_encoded(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_ } if (width == 1) { + if (*parser->current.end == '\n') pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(*parser->current.end++, flags)); } else if (width > 1) { // Valid multibyte character. Just ignore escape. @@ -9649,7 +8986,7 @@ escape_read_warn(pm_parser_t *parser, uint8_t flags, uint8_t flag, const char *t PM_PARSER_WARN_TOKEN_FORMAT( parser, - parser->current, + &parser->current, PM_WARN_INVALID_CHARACTER, FLAG(flags), FLAG(flag), @@ -9764,7 +9101,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre } } - escape_write_byte_encoded(parser, buffer, value); + escape_write_byte_encoded(parser, buffer, flags, value); } else { pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_HEXADECIMAL); } @@ -9777,7 +9114,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (parser->current.end == parser->end) { const uint8_t *start = parser->current.end - 2; - PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(parser->current.end - start), PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); } else if (peek(parser) == '{') { const uint8_t *unicode_codepoints_start = parser->current.end - 2; parser->current.end++; @@ -9806,18 +9143,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (hexadecimal_length > 6) { // \u{nnnn} character literal allows only 1-6 hexadecimal digits - pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG); + pm_parser_err(parser, U32(unicode_start - parser->start), U32(hexadecimal_length), PM_ERR_ESCAPE_INVALID_UNICODE_LONG); } else if (hexadecimal_length == 0) { // there are not hexadecimal characters if (flags & PM_ESCAPE_FLAG_REGEXP) { // If this is a regular expression, we are going to // let the regular expression engine handle this - // error instead of us. + // error instead of us because we don't know at this + // point if we're inside a comment in /x mode. pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } else { - pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE); - pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + pm_parser_err(parser, PM_TOKEN_END(parser, &parser->current), 0, PM_ERR_ESCAPE_INVALID_UNICODE); + pm_parser_err(parser, PM_TOKEN_END(parser, &parser->current), 0, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); } return; @@ -9829,7 +9167,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre extra_codepoints_start = unicode_start; } - uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length); + uint32_t value = escape_unicode(parser, unicode_start, hexadecimal_length, NULL, flags); escape_write_unicode(parser, buffer, flags, unicode_start, parser->current.end, value); parser->current.end += pm_strspn_inline_whitespace(parser->current.end, parser->end - parser->current.end); @@ -9838,21 +9176,22 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre // ?\u{nnnn} character literal should contain only one codepoint // and cannot be like ?\u{nnnn mmmm}. if (flags & PM_ESCAPE_FLAG_SINGLE && codepoints_count > 1) { - pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); + pm_parser_err(parser, U32(extra_codepoints_start - parser->start), U32(parser->current.end - 1 - extra_codepoints_start), PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); } if (parser->current.end == parser->end) { - PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start); + PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(parser->current.end - start), PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start); } else if (peek(parser) == '}') { parser->current.end++; } else { if (flags & PM_ESCAPE_FLAG_REGEXP) { // If this is a regular expression, we are going to let // the regular expression engine handle this error - // instead of us. + // instead of us because we don't know at this point if + // we're inside a comment in /x mode. pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } else { - pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + pm_parser_err(parser, U32(unicode_codepoints_start - parser->start), U32(parser->current.end - unicode_codepoints_start), PM_ERR_ESCAPE_INVALID_UNICODE_TERM); } } @@ -9867,10 +9206,10 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); } else { const uint8_t *start = parser->current.end - 2; - PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + PM_PARSER_ERR_FORMAT(parser, U32(start - parser->start), U32(parser->current.end - start), PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); } } else if (length == 4) { - uint32_t value = escape_unicode(parser, parser->current.end, 4); + uint32_t value = escape_unicode(parser, parser->current.end, 4, NULL, flags); if (flags & PM_ESCAPE_FLAG_REGEXP) { pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end + 4 - start)); @@ -9916,7 +9255,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre parser->current.end++; if (match(parser, 'u') || match(parser, 'U')) { - pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current), PM_ERR_INVALID_ESCAPE_CHARACTER); return; } @@ -9938,6 +9277,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre return; } + if (peeked == '\n') pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); parser->current.end++; escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; @@ -9952,7 +9292,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (peek(parser) != '-') { size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_CONTROL); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), PM_ERR_ESCAPE_INVALID_CONTROL); return; } @@ -9973,7 +9313,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre parser->current.end++; if (match(parser, 'u') || match(parser, 'U')) { - pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current), PM_ERR_INVALID_ESCAPE_CHARACTER); return; } @@ -9992,10 +9332,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre default: { if (!char_is_ascii_printable(peeked)) { size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_CONTROL); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), PM_ERR_ESCAPE_INVALID_CONTROL); return; } + if (peeked == '\n') pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); parser->current.end++; escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_CONTROL)); return; @@ -10010,7 +9351,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre if (peek(parser) != '-') { size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), PM_ERR_ESCAPE_INVALID_META); return; } @@ -10026,7 +9367,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre parser->current.end++; if (match(parser, 'u') || match(parser, 'U')) { - pm_parser_err(parser, parser->current.start, parser->current.end, PM_ERR_INVALID_ESCAPE_CHARACTER); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current), PM_ERR_INVALID_ESCAPE_CHARACTER); return; } @@ -10045,10 +9386,11 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre default: if (!char_is_ascii_printable(peeked)) { size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), PM_ERR_ESCAPE_INVALID_META); return; } + if (peeked == '\n') pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); parser->current.end++; escape_write_byte(parser, buffer, regular_expression_buffer, flags, escape_byte(peeked, flags | PM_ESCAPE_FLAG_META)); return; @@ -10056,8 +9398,9 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre } case '\r': { if (peek_offset(parser, 1) == '\n') { + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 2); parser->current.end += 2; - escape_write_byte_encoded(parser, buffer, escape_byte('\n', flags)); + escape_write_byte_encoded(parser, buffer, flags, escape_byte('\n', flags)); return; } PRISM_FALLTHROUGH @@ -10065,7 +9408,7 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre default: { if ((flags & (PM_ESCAPE_FLAG_CONTROL | PM_ESCAPE_FLAG_META)) && !char_is_ascii_printable(peeked)) { size_t width = parser->encoding->char_width(parser->current.end, parser->end - parser->current.end); - pm_parser_err(parser, parser->current.start, parser->current.end + width, PM_ERR_ESCAPE_INVALID_META); + pm_parser_err(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current) + U32(width), PM_ERR_ESCAPE_INVALID_META); return; } if (parser->current.end < parser->end) { @@ -10127,10 +9470,14 @@ lex_question_mark(pm_parser_t *parser) { lex_state_set(parser, PM_LEX_STATE_END); pm_buffer_t buffer; - pm_buffer_init_capacity(&buffer, 3); + pm_buffer_init(&buffer, 3); escape_read(parser, &buffer, NULL, PM_ESCAPE_FLAG_SINGLE); - pm_string_owned_init(&parser->current_string, (uint8_t *) buffer.value, buffer.length); + + // Copy buffer data into the arena and free the heap buffer. + void *arena_data = pm_arena_memdup(parser->arena, buffer.value, buffer.length, PRISM_ALIGNOF(uint8_t)); + pm_string_constant_init(&parser->current_string, (const char *) arena_data, buffer.length); + pm_buffer_cleanup(&buffer); return PM_TOKEN_CHARACTER_LITERAL; } else { @@ -10173,12 +9520,12 @@ lex_at_variable(pm_parser_t *parser) { } } else if (parser->current.end < end && pm_char_is_decimal_digit(*parser->current.end)) { pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE; - if (parser->version == PM_OPTIONS_VERSION_CRUBY_3_3) { + if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) { diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_INCOMPLETE_VARIABLE_CLASS_3_3 : PM_ERR_INCOMPLETE_VARIABLE_INSTANCE_3_3; } size_t width = parser->encoding->char_width(parser->current.end, end - parser->current.end); - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, diag_id, (int) ((parser->current.end + width) - parser->current.start), (const char *) parser->current.start); } else { pm_diagnostic_id_t diag_id = (type == PM_TOKEN_CLASS_VARIABLE) ? PM_ERR_CLASS_VARIABLE_BARE : PM_ERR_INSTANCE_VARIABLE_BARE; pm_parser_err_token(parser, &parser->current, diag_id); @@ -10196,24 +9543,23 @@ lex_at_variable(pm_parser_t *parser) { /** * Optionally call out to the lex callback if one is provided. */ -static inline void +static PRISM_INLINE void parser_lex_callback(pm_parser_t *parser) { - if (parser->lex_callback) { - parser->lex_callback->callback(parser->lex_callback->data, parser, &parser->current); + if (parser->lex_callback.callback) { + parser->lex_callback.callback(parser, &parser->current, parser->lex_callback.data); } } /** * Return a new comment node of the specified type. */ -static inline pm_comment_t * +static PRISM_INLINE pm_comment_t * parser_comment(pm_parser_t *parser, pm_comment_type_t type) { - pm_comment_t *comment = (pm_comment_t *) xcalloc(1, sizeof(pm_comment_t)); - if (comment == NULL) return NULL; + pm_comment_t *comment = (pm_comment_t *) pm_arena_alloc(&parser->metadata_arena, sizeof(pm_comment_t), PRISM_ALIGNOF(pm_comment_t)); *comment = (pm_comment_t) { .type = type, - .location = { parser->current.start, parser->current.end } + .location = TOK2LOC(parser, &parser->current) }; return comment; @@ -10232,7 +9578,7 @@ lex_embdoc(pm_parser_t *parser) { if (newline == NULL) { parser->current.end = parser->end; } else { - pm_newline_list_append(&parser->newline_list, newline); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(newline - parser->start + 1)); parser->current.end = newline + 1; } @@ -10240,8 +9586,8 @@ lex_embdoc(pm_parser_t *parser) { parser_lex_callback(parser); // Now, create a comment that is going to be attached to the parser. + const uint8_t *comment_start = parser->current.start; pm_comment_t *comment = parser_comment(parser, PM_COMMENT_EMBDOC); - if (comment == NULL) return PM_TOKEN_EOF; // Now, loop until we find the end of the embedded documentation or the end // of the file. @@ -10265,14 +9611,14 @@ lex_embdoc(pm_parser_t *parser) { if (newline == NULL) { parser->current.end = parser->end; } else { - pm_newline_list_append(&parser->newline_list, newline); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(newline - parser->start + 1)); parser->current.end = newline + 1; } parser->current.type = PM_TOKEN_EMBDOC_END; parser_lex_callback(parser); - comment->location.end = parser->current.end; + comment->location.length = (uint32_t) (parser->current.end - comment_start); pm_list_append(&parser->comment_list, (pm_list_node_t *) comment); return PM_TOKEN_EMBDOC_END; @@ -10285,7 +9631,7 @@ lex_embdoc(pm_parser_t *parser) { if (newline == NULL) { parser->current.end = parser->end; } else { - pm_newline_list_append(&parser->newline_list, newline); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(newline - parser->start + 1)); parser->current.end = newline + 1; } @@ -10295,7 +9641,7 @@ lex_embdoc(pm_parser_t *parser) { pm_parser_err_current(parser, PM_ERR_EMBDOC_TERM); - comment->location.end = parser->current.end; + comment->location.length = (uint32_t) (parser->current.end - comment_start); pm_list_append(&parser->comment_list, (pm_list_node_t *) comment); return PM_TOKEN_EOF; @@ -10306,7 +9652,7 @@ lex_embdoc(pm_parser_t *parser) { * This happens in a couple places depending on whether or not we have already * lexed a comment. */ -static inline void +static PRISM_INLINE void parser_lex_ignored_newline(pm_parser_t *parser) { parser->current.type = PM_TOKEN_IGNORED_NEWLINE; parser_lex_callback(parser); @@ -10321,7 +9667,7 @@ parser_lex_ignored_newline(pm_parser_t *parser) { * If it is set, then we need to skip past the heredoc body and then clear the * heredoc_end field. */ -static inline void +static PRISM_INLINE void parser_flush_heredoc_end(pm_parser_t *parser) { assert(parser->heredoc_end <= parser->end); parser->next_start = parser->heredoc_end; @@ -10397,12 +9743,12 @@ typedef struct { /** * Push the given byte into the token buffer. */ -static inline void +static PRISM_INLINE void pm_token_buffer_push_byte(pm_token_buffer_t *token_buffer, uint8_t byte) { pm_buffer_append_byte(&token_buffer->buffer, byte); } -static inline void +static PRISM_INLINE void pm_regexp_token_buffer_push_byte(pm_regexp_token_buffer_t *token_buffer, uint8_t byte) { pm_buffer_append_byte(&token_buffer->regexp_buffer, byte); } @@ -10410,7 +9756,7 @@ pm_regexp_token_buffer_push_byte(pm_regexp_token_buffer_t *token_buffer, uint8_t /** * Return the width of the character at the end of the current token. */ -static inline size_t +static PRISM_INLINE size_t parser_char_width(const pm_parser_t *parser) { size_t width; if (parser->encoding_changed) { @@ -10437,36 +9783,31 @@ pm_token_buffer_push_escaped(pm_token_buffer_t *token_buffer, pm_parser_t *parse static void pm_regexp_token_buffer_push_escaped(pm_regexp_token_buffer_t *token_buffer, pm_parser_t *parser) { size_t width = parser_char_width(parser); - pm_buffer_append_bytes(&token_buffer->base.buffer, parser->current.end, width); - pm_buffer_append_bytes(&token_buffer->regexp_buffer, parser->current.end, width); + const uint8_t *start = parser->current.end; + pm_buffer_append_bytes(&token_buffer->base.buffer, start, width); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, start, width); parser->current.end += width; } -static bool -pm_slice_ascii_only_p(const uint8_t *value, size_t length) { - for (size_t index = 0; index < length; index++) { - if (value[index] & 0x80) return false; - } - - return true; -} - /** * When we're about to return from lexing the current token and we know for sure * that we have found an escape sequence, this function is called to copy the * contents of the token buffer into the current string on the parser so that it * can be attached to the correct node. */ -static inline void +static PRISM_INLINE void pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { - pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->buffer), pm_buffer_length(&token_buffer->buffer)); + // Copy buffer data into the arena and free the heap buffer. + size_t len = pm_buffer_length(&token_buffer->buffer); + void *arena_data = pm_arena_memdup(parser->arena, pm_buffer_value(&token_buffer->buffer), len, PRISM_ALIGNOF(uint8_t)); + pm_string_constant_init(&parser->current_string, (const char *) arena_data, len); + pm_buffer_cleanup(&token_buffer->buffer); } -static inline void +static PRISM_INLINE void pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { - pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->base.buffer), pm_buffer_length(&token_buffer->base.buffer)); - parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p((const uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer)); - pm_buffer_free(&token_buffer->regexp_buffer); + pm_token_buffer_copy(parser, &token_buffer->base); + pm_buffer_cleanup(&token_buffer->regexp_buffer); } /** @@ -10492,10 +9833,11 @@ static void pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { if (token_buffer->base.cursor == NULL) { pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); - parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p(parser->current.start, (size_t) (parser->current.end - parser->current.start)); } else { - pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); - pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); + const uint8_t *cursor = token_buffer->base.cursor; + size_t length = (size_t) (parser->current.end - cursor); + pm_buffer_append_bytes(&token_buffer->base.buffer, cursor, length); + pm_buffer_append_bytes(&token_buffer->regexp_buffer, cursor, length); pm_regexp_token_buffer_copy(parser, token_buffer); } } @@ -10514,7 +9856,7 @@ static void pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) { const uint8_t *start; if (token_buffer->cursor == NULL) { - pm_buffer_init_capacity(&token_buffer->buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + pm_buffer_init(&token_buffer->buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); start = parser->current.start; } else { start = token_buffer->cursor; @@ -10531,8 +9873,8 @@ static void pm_regexp_token_buffer_escape(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { const uint8_t *start; if (token_buffer->base.cursor == NULL) { - pm_buffer_init_capacity(&token_buffer->base.buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); - pm_buffer_init_capacity(&token_buffer->regexp_buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + pm_buffer_init(&token_buffer->base.buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); + pm_buffer_init(&token_buffer->regexp_buffer, PM_TOKEN_BUFFER_DEFAULT_SIZE); start = parser->current.start; } else { start = token_buffer->base.cursor; @@ -10551,7 +9893,7 @@ pm_regexp_token_buffer_escape(pm_parser_t *parser, pm_regexp_token_buffer_t *tok * Effectively the same thing as pm_strspn_inline_whitespace, but in the case of * a tilde heredoc expands out tab characters to the nearest tab boundaries. */ -static inline size_t +static PRISM_INLINE size_t pm_heredoc_strspn_inline_whitespace(pm_parser_t *parser, const uint8_t **cursor, pm_heredoc_indent_t indent) { size_t whitespace = 0; @@ -10599,7 +9941,7 @@ pm_lex_percent_delimiter(pm_parser_t *parser) { parser_flush_heredoc_end(parser); } else { // Otherwise, we'll add the newline to the list of newlines. - pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + U32(eol_length)); } uint8_t delimiter = *parser->current.end; @@ -10647,6 +9989,12 @@ parser_lex(pm_parser_t *parser) { unsigned int semantic_token_seen = parser->semantic_token_seen; parser->semantic_token_seen = true; + // We'll jump to this label when we are about to encounter an EOF. + // If we still have lex_modes on the stack, we pop them so that cleanup + // can happen. For example, we should still continue parsing after a heredoc + // identifier, even if the heredoc body was syntax invalid. + switch_lex_modes: + switch (parser->lex_modes.current->mode) { case PM_LEX_DEFAULT: case PM_LEX_EMBEXPR: @@ -10669,22 +10017,29 @@ parser_lex(pm_parser_t *parser) { bool space_seen = false; // First, we're going to skip past any whitespace at the front of the next - // token. + // token. Skip runs of inline whitespace in bulk to avoid per-character + // stores back to parser->current.end. bool chomping = true; while (parser->current.end < parser->end && chomping) { - switch (*parser->current.end) { - case ' ': - case '\t': - case '\f': - case '\v': - parser->current.end++; + { + static const uint8_t inline_whitespace[256] = { + [' '] = 1, ['\t'] = 1, ['\f'] = 1, ['\v'] = 1 + }; + const uint8_t *scan = parser->current.end; + while (scan < parser->end && inline_whitespace[*scan]) scan++; + if (scan > parser->current.end) { + parser->current.end = scan; space_seen = true; - break; + continue; + } + } + + switch (*parser->current.end) { case '\r': if (match_eol_offset(parser, 1)) { chomping = false; } else { - pm_parser_warn(parser, parser->current.end, parser->current.end + 1, PM_WARN_UNEXPECTED_CARRIAGE_RETURN); + pm_parser_warn(parser, PM_TOKEN_END(parser, &parser->current), 1, PM_WARN_UNEXPECTED_CARRIAGE_RETURN); parser->current.end++; space_seen = true; } @@ -10697,7 +10052,7 @@ parser_lex(pm_parser_t *parser) { parser->heredoc_end = NULL; } else { parser->current.end += eol_length + 1; - pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current)); space_seen = true; } } else if (pm_char_is_inline_whitespace(*parser->current.end)) { @@ -10720,6 +10075,14 @@ parser_lex(pm_parser_t *parser) { // We'll check if we're at the end of the file. If we are, then we // need to return the EOF token. if (parser->current.end >= parser->end) { + // We may be missing closing tokens. We should pop modes one by one + // to do the appropriate cleanup like moving next_start for heredocs. + // Only when no mode is remaining will we actually emit the EOF token. + if (parser->lex_modes.current->mode != PM_LEX_DEFAULT) { + lex_mode_pop(parser); + goto switch_lex_modes; + } + // If we hit EOF, but the EOF came immediately after a newline, // set the start of the token to the newline. This way any EOF // errors will be reported as happening on that line rather than @@ -10791,7 +10154,7 @@ parser_lex(pm_parser_t *parser) { } if (parser->heredoc_end == NULL) { - pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current)); } } @@ -10849,14 +10212,50 @@ parser_lex(pm_parser_t *parser) { following = next_newline(following, parser->end - following); } - // If the lex state was ignored, or we hit a '.' or a '&.', - // we will lex the ignored newline + // If the lex state was ignored, we will lex the + // ignored newline. + if (lex_state_ignored_p(parser)) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + + // If we hit a '.' or a '&.' we will lex the ignored + // newline. + if (following && ( + (peek_at(parser, following) == '.') || + (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '.') + )) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lexed_comment = false; + goto lex_next_token; + } + + + // If we are parsing as CRuby 4.0 or later and we + // hit a '&&' or a '||' then we will lex the ignored + // newline. if ( - lex_state_ignored_p(parser) || - (following && ( - (peek_at(parser, following) == '.') || - (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '.') - )) + (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) && + following && ( + (peek_at(parser, following) == '&' && peek_at(parser, following + 1) == '&') || + (peek_at(parser, following) == '|' && peek_at(parser, following + 1) == '|') || + ( + peek_at(parser, following) == 'a' && + peek_at(parser, following + 1) == 'n' && + peek_at(parser, following + 2) == 'd' && + peek_at(parser, next_content + 3) != '!' && + peek_at(parser, next_content + 3) != '?' && + !char_is_identifier(parser, following + 3, parser->end - (following + 3)) + ) || + ( + peek_at(parser, following) == 'o' && + peek_at(parser, following + 1) == 'r' && + peek_at(parser, next_content + 2) != '!' && + peek_at(parser, next_content + 2) != '?' && + !char_is_identifier(parser, following + 2, parser->end - (following + 2)) + ) + ) ) { if (!lexed_comment) parser_lex_ignored_newline(parser); lexed_comment = false; @@ -10896,6 +10295,67 @@ parser_lex(pm_parser_t *parser) { parser->next_start = NULL; LEX(PM_TOKEN_AMPERSAND_DOT); } + + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { + // If we hit an && then we are in a logical chain + // and we need to return the logical operator. + if (peek_at(parser, next_content) == '&' && peek_at(parser, next_content + 1) == '&') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + LEX(PM_TOKEN_AMPERSAND_AMPERSAND); + } + + // If we hit a || then we are in a logical chain and + // we need to return the logical operator. + if (peek_at(parser, next_content) == '|' && peek_at(parser, next_content + 1) == '|') { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + LEX(PM_TOKEN_PIPE_PIPE); + } + + // If we hit an 'and' then we are in a logical chain + // and we need to return the logical operator. + if ( + peek_at(parser, next_content) == 'a' && + peek_at(parser, next_content + 1) == 'n' && + peek_at(parser, next_content + 2) == 'd' && + peek_at(parser, next_content + 3) != '!' && + peek_at(parser, next_content + 3) != '?' && + !char_is_identifier(parser, next_content + 3, parser->end - (next_content + 3)) + ) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 3; + parser->next_start = NULL; + parser->command_start = true; + LEX(PM_TOKEN_KEYWORD_AND); + } + + // If we hit a 'or' then we are in a logical chain + // and we need to return the logical operator. + if ( + peek_at(parser, next_content) == 'o' && + peek_at(parser, next_content + 1) == 'r' && + peek_at(parser, next_content + 2) != '!' && + peek_at(parser, next_content + 2) != '?' && + !char_is_identifier(parser, next_content + 2, parser->end - (next_content + 2)) + ) { + if (!lexed_comment) parser_lex_ignored_newline(parser); + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->current.start = next_content; + parser->current.end = next_content + 2; + parser->next_start = NULL; + parser->command_start = true; + LEX(PM_TOKEN_KEYWORD_OR); + } + } } // At this point we know this is a regular newline, and we can set the @@ -10910,7 +10370,7 @@ parser_lex(pm_parser_t *parser) { // , case ',': if ((parser->previous.type == PM_TOKEN_COMMA) && (parser->enclosure_nesting > 0)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARRAY_TERM, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_ARRAY_TERM, pm_token_str(parser->current.type)); } lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); @@ -11036,7 +10496,7 @@ parser_lex(pm_parser_t *parser) { } else if (lex_state_beg_p(parser)) { type = PM_TOKEN_USTAR_STAR; } else if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "**", "argument prefix"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "**", "argument prefix"); } if (lex_state_operator_p(parser)) { @@ -11061,7 +10521,7 @@ parser_lex(pm_parser_t *parser) { } else if (lex_state_beg_p(parser)) { type = PM_TOKEN_USTAR; } else if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "*", "argument prefix"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "*", "argument prefix"); } if (lex_state_operator_p(parser)) { @@ -11187,7 +10647,7 @@ parser_lex(pm_parser_t *parser) { bool ident_error = false; if (quote != PM_HEREDOC_QUOTE_NONE && !match(parser, (uint8_t) quote)) { - pm_parser_err(parser, ident_start, ident_start + ident_length, PM_ERR_HEREDOC_IDENTIFIER); + pm_parser_err(parser, U32(ident_start - parser->start), U32(ident_length), PM_ERR_HEREDOC_IDENTIFIER); ident_error = true; } @@ -11220,7 +10680,7 @@ parser_lex(pm_parser_t *parser) { } else { // Otherwise, we want to indicate that the body of the // heredoc starts on the character after the next newline. - pm_newline_list_append(&parser->newline_list, body_start); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(body_start - parser->start + 1)); body_start++; } @@ -11239,7 +10699,7 @@ parser_lex(pm_parser_t *parser) { } if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "<<", "here document"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "<<", "here document"); } if (lex_state_operator_p(parser)) { @@ -11365,7 +10825,7 @@ parser_lex(pm_parser_t *parser) { } else if (lex_state_beg_p(parser)) { type = PM_TOKEN_UAMPERSAND; } else if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "&", "argument prefix"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "&", "argument prefix"); } if (lex_state_operator_p(parser)) { @@ -11441,7 +10901,7 @@ parser_lex(pm_parser_t *parser) { } if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "+", "unary operator"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "+", "unary operator"); } lex_state_set(parser, PM_LEX_STATE_BEG); @@ -11482,7 +10942,7 @@ parser_lex(pm_parser_t *parser) { } if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "-", "unary operator"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "-", "unary operator"); } lex_state_set(parser, PM_LEX_STATE_BEG); @@ -11581,7 +11041,7 @@ parser_lex(pm_parser_t *parser) { } if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "/", "regexp literal"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "/", "regexp literal"); } if (lex_state_operator_p(parser)) { @@ -11766,7 +11226,7 @@ parser_lex(pm_parser_t *parser) { } if (ambiguous_operator_p(parser, space_seen)) { - PM_PARSER_WARN_TOKEN_FORMAT(parser, parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "%", "string literal"); + PM_PARSER_WARN_TOKEN_FORMAT(parser, &parser->current, PM_WARN_AMBIGUOUS_BINARY_OPERATOR, "%", "string literal"); } lex_state_set(parser, lex_state_operator_p(parser) ? PM_LEX_STATE_ARG : PM_LEX_STATE_BEG); @@ -11802,40 +11262,40 @@ parser_lex(pm_parser_t *parser) { // token after adding an appropriate error message. if (!width) { if (*parser->current.start >= 0x80) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *parser->current.start); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *parser->current.start); } else if (*parser->current.start == '\\') { switch (peek_at(parser, parser->current.start + 1)) { case ' ': parser->current.end++; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped space"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped space"); break; case '\f': parser->current.end++; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped form feed"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped form feed"); break; case '\t': parser->current.end++; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped horizontal tab"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped horizontal tab"); break; case '\v': parser->current.end++; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped vertical tab"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped vertical tab"); break; case '\r': if (peek_at(parser, parser->current.start + 2) != '\n') { parser->current.end++; - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped carriage return"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "escaped carriage return"); break; } PRISM_FALLTHROUGH default: - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "backslash"); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, "backslash"); break; } } else if (char_is_ascii_printable(*parser->current.start)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_PRINTABLE_CHARACTER, *parser->current.start); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_PRINTABLE_CHARACTER, *parser->current.start); } else { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_CHARACTER, *parser->current.start); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_INVALID_CHARACTER, *parser->current.start); } goto lex_next_token; @@ -11861,15 +11321,15 @@ parser_lex(pm_parser_t *parser) { // correct column information for it. const uint8_t *cursor = parser->current.end; while ((cursor = next_newline(cursor, parser->end - cursor)) != NULL) { - pm_newline_list_append(&parser->newline_list, cursor++); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(++cursor - parser->start)); } parser->current.end = parser->end; parser->current.type = PM_TOKEN___END__; parser_lex_callback(parser); - parser->data_loc.start = parser->current.start; - parser->data_loc.end = parser->current.end; + parser->data_loc.start = PM_TOKEN_START(parser, &parser->current); + parser->data_loc.length = PM_TOKEN_LENGTH(&parser->current); LEX(PM_TOKEN_EOF); } @@ -11894,7 +11354,7 @@ parser_lex(pm_parser_t *parser) { !(last_state & (PM_LEX_STATE_DOT | PM_LEX_STATE_FNAME)) && (type == PM_TOKEN_IDENTIFIER) && ((pm_parser_local_depth(parser, &parser->current) != -1) || - pm_token_is_numbered_parameter(parser->current.start, parser->current.end)) + pm_token_is_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))) ) { lex_state_set(parser, PM_LEX_STATE_END | PM_LEX_STATE_LABEL); } @@ -11922,7 +11382,7 @@ parser_lex(pm_parser_t *parser) { whitespace += 1; } } else { - whitespace = pm_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list); + whitespace = pm_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current)); } if (whitespace > 0) { @@ -12037,7 +11497,7 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_STRING_CONTENT); } else { // ... else track the newline. - pm_newline_list_append(&parser->newline_list, parser->current.end); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); } parser->current.end++; @@ -12065,7 +11525,7 @@ parser_lex(pm_parser_t *parser) { if (*breakpoint == '#') { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type == PM_TOKEN_NOT_PROVIDED) { + if (!type) { // If we haven't returned at this point then we had something // that looked like an interpolated class or instance variable // like "#@" but wasn't actually. In this case we'll just skip @@ -12170,7 +11630,13 @@ parser_lex(pm_parser_t *parser) { size_t eol_length = match_eol_at(parser, breakpoint); if (eol_length) { parser->current.end = breakpoint + eol_length; - pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + + // Track the newline if we're not in a heredoc that + // would have already have added the newline to the + // list. + if (parser->heredoc_end == NULL) { + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current)); + } } else { parser->current.end = breakpoint + 1; } @@ -12216,7 +11682,7 @@ parser_lex(pm_parser_t *parser) { // If we've hit a newline, then we need to track that in // the list of newlines. if (parser->heredoc_end == NULL) { - pm_newline_list_append(&parser->newline_list, breakpoint); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(breakpoint - parser->start + 1)); parser->current.end = breakpoint + 1; breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false); break; @@ -12264,7 +11730,7 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_STRING_CONTENT); } else { // ... else track the newline. - pm_newline_list_append(&parser->newline_list, parser->current.end); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); } parser->current.end++; @@ -12311,7 +11777,7 @@ parser_lex(pm_parser_t *parser) { // interpolation. pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type == PM_TOKEN_NOT_PROVIDED) { + if (!type) { // If we haven't returned at this point then we had // something that looked like an interpolated class or // instance variable like "#@" but wasn't actually. In @@ -12424,7 +11890,13 @@ parser_lex(pm_parser_t *parser) { size_t eol_length = match_eol_at(parser, breakpoint); if (eol_length) { parser->current.end = breakpoint + eol_length; - pm_newline_list_append(&parser->newline_list, parser->current.end - 1); + + // Track the newline if we're not in a heredoc that + // would have already have added the newline to the + // list. + if (parser->heredoc_end == NULL) { + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current)); + } } else { parser->current.end = breakpoint + 1; } @@ -12436,6 +11908,13 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_LABEL_END); } + // When the delimiter itself is a newline, we won't + // get a chance to flush heredocs in the usual places since + // the newline is already consumed. + if (term == '\n' && parser->heredoc_end) { + parser_flush_heredoc_end(parser); + } + lex_state_set(parser, PM_LEX_STATE_END); lex_mode_pop(parser); LEX(PM_TOKEN_STRING_END); @@ -12468,7 +11947,7 @@ parser_lex(pm_parser_t *parser) { // for the terminator in case the terminator is a // newline character. if (parser->heredoc_end == NULL) { - pm_newline_list_append(&parser->newline_list, breakpoint); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(breakpoint - parser->start + 1)); parser->current.end = breakpoint + 1; breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true); break; @@ -12522,7 +12001,7 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_STRING_CONTENT); } else { // ... else track the newline. - pm_newline_list_append(&parser->newline_list, parser->current.end); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, PM_TOKEN_END(parser, &parser->current) + 1); } parser->current.end++; @@ -12551,7 +12030,7 @@ parser_lex(pm_parser_t *parser) { case '#': { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type == PM_TOKEN_NOT_PROVIDED) { + if (!type) { // If we haven't returned at this point then we had something that // looked like an interpolated class or instance variable like "#@" // but wasn't actually. In this case we'll just skip to the next @@ -12651,7 +12130,7 @@ parser_lex(pm_parser_t *parser) { (memcmp(terminator_start, ident_start, ident_length) == 0) ) { if (newline != NULL) { - pm_newline_list_append(&parser->newline_list, newline); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(newline - parser->start + 1)); } parser->current.end = terminator_end; @@ -12682,7 +12161,7 @@ parser_lex(pm_parser_t *parser) { // Otherwise we'll be parsing string content. These are the places // where we need to split up the content of the heredoc. We'll use // strpbrk to find the first of these characters. - uint8_t breakpoints[] = "\r\n\\#"; + uint8_t breakpoints[PM_STRPBRK_CACHE_SIZE] = "\r\n\\#"; pm_heredoc_quote_t quote = heredoc_lex_mode->quote; if (quote == PM_HEREDOC_QUOTE_SINGLE) { @@ -12723,7 +12202,7 @@ parser_lex(pm_parser_t *parser) { LEX(PM_TOKEN_STRING_CONTENT); } - pm_newline_list_append(&parser->newline_list, breakpoint); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(breakpoint - parser->start + 1)); // If we have a - or ~ heredoc, then we can match after // some leading whitespace. @@ -12841,7 +12320,10 @@ parser_lex(pm_parser_t *parser) { // string content. if (heredoc_lex_mode->indent == PM_HEREDOC_INDENT_TILDE) { const uint8_t *end = parser->current.end; - pm_newline_list_append(&parser->newline_list, end); + + if (parser->heredoc_end == NULL) { + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(end - parser->start + 1)); + } // Here we want the buffer to only // include up to the backslash. @@ -12872,7 +12354,7 @@ parser_lex(pm_parser_t *parser) { case '#': { pm_token_type_t type = lex_interpolation(parser, breakpoint); - if (type == PM_TOKEN_NOT_PROVIDED) { + if (!type) { // If we haven't returned at this point then we had // something that looked like an interpolated class // or instance variable like "#@" but wasn't @@ -13097,7 +12579,7 @@ pm_binding_powers_t pm_binding_powers[PM_TOKEN_MAXIMUM] = { /** * Returns true if the current token is of the given type. */ -static inline bool +static PRISM_INLINE bool match1(const pm_parser_t *parser, pm_token_type_t type) { return parser->current.type == type; } @@ -13105,7 +12587,7 @@ match1(const pm_parser_t *parser, pm_token_type_t type) { /** * Returns true if the current token is of either of the given types. */ -static inline bool +static PRISM_INLINE bool match2(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2) { return match1(parser, type1) || match1(parser, type2); } @@ -13113,7 +12595,7 @@ match2(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2) /** * Returns true if the current token is any of the three given types. */ -static inline bool +static PRISM_INLINE bool match3(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3) { return match1(parser, type1) || match1(parser, type2) || match1(parser, type3); } @@ -13121,15 +12603,23 @@ match3(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, /** * Returns true if the current token is any of the four given types. */ -static inline bool +static PRISM_INLINE bool match4(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4) { return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4); } /** + * Returns true if the current token is any of the six given types. + */ +static PRISM_INLINE bool +match6(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6) { + return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6); +} + +/** * Returns true if the current token is any of the seven given types. */ -static inline bool +static PRISM_INLINE bool match7(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7) { return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7); } @@ -13137,20 +12627,12 @@ match7(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, /** * Returns true if the current token is any of the eight given types. */ -static inline bool +static PRISM_INLINE bool match8(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7, pm_token_type_t type8) { return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8); } /** - * Returns true if the current token is any of the nine given types. - */ -static inline bool -match9(const pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_token_type_t type3, pm_token_type_t type4, pm_token_type_t type5, pm_token_type_t type6, pm_token_type_t type7, pm_token_type_t type8, pm_token_type_t type9) { - return match1(parser, type1) || match1(parser, type2) || match1(parser, type3) || match1(parser, type4) || match1(parser, type5) || match1(parser, type6) || match1(parser, type7) || match1(parser, type8) || match1(parser, type9); -} - -/** * If the current token is of the specified type, lex forward by one token and * return true. Otherwise, return false. For example: * @@ -13169,7 +12651,7 @@ accept1(pm_parser_t *parser, pm_token_type_t type) { * If the current token is either of the two given types, lex forward by one * token and return true. Otherwise return false. */ -static inline bool +static PRISM_INLINE bool accept2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2) { if (match2(parser, type1, type2)) { parser_lex(parser); @@ -13194,10 +12676,10 @@ expect1(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id) { if (accept1(parser, type)) return; const uint8_t *location = parser->previous.end; - pm_parser_err(parser, location, location, diag_id); + pm_parser_err(parser, U32(location - parser->start), 0, diag_id); parser->previous.start = location; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } /** @@ -13209,10 +12691,10 @@ expect2(pm_parser_t *parser, pm_token_type_t type1, pm_token_type_t type2, pm_di if (accept2(parser, type1, type2)) return; const uint8_t *location = parser->previous.end; - pm_parser_err(parser, location, location, diag_id); + pm_parser_err(parser, U32(location - parser->start), 0, diag_id); parser->previous.start = location; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } /** @@ -13226,20 +12708,43 @@ expect1_heredoc_term(pm_parser_t *parser, const uint8_t *ident_start, size_t ide } else { pm_parser_err_heredoc_term(parser, ident_start, ident_length); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } } +/** + * A special expect1 that attaches the error to the opening token location + * rather than the current position. This is useful for errors about missing + * closing tokens, where we want to point to the line with the opening token + * (e.g., `def`, `class`, `if`, `{`) rather than the end of the file. + */ +static void +expect1_opening(pm_parser_t *parser, pm_token_type_t type, pm_diagnostic_id_t diag_id, const pm_token_t *opening) { + if (accept1(parser, type)) return; + + const uint8_t *start = opening->start; + pm_parser_err(parser, U32(start - parser->start), U32(opening->end - start), diag_id); + + parser->previous.start = parser->previous.end; + parser->previous.type = 0; +} + +/** Flags for controlling expression parsing behavior. */ +#define PM_PARSE_ACCEPTS_COMMAND_CALL ((uint8_t) 0x1) +#define PM_PARSE_ACCEPTS_LABEL ((uint8_t) 0x2) +#define PM_PARSE_ACCEPTS_DO_BLOCK ((uint8_t) 0x4) +#define PM_PARSE_IN_ENDLESS_DEF ((uint8_t) 0x8) + static pm_node_t * -parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth); +parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth); /** * This is a wrapper of parse_expression, which also checks whether the * resulting node is a value expression. */ static pm_node_t * -parse_value_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { - pm_node_t *node = parse_expression(parser, binding_power, accepts_command_call, accepts_label, diag_id, depth); +parse_value_expression(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { + pm_node_t *node = parse_expression(parser, binding_power, flags, diag_id, depth); pm_assert_value_expression(parser, node); return node; } @@ -13262,7 +12767,7 @@ parse_value_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bo * work in all cases, it may need to be refactored later. But it appears to work * for now. */ -static inline bool +static PRISM_INLINE bool token_begins_expression_p(pm_token_type_t type) { switch (type) { case PM_TOKEN_EQUAL_GREATER: @@ -13278,6 +12783,7 @@ token_begins_expression_p(pm_token_type_t type) { case PM_TOKEN_EOF: case PM_TOKEN_LAMBDA_BEGIN: case PM_TOKEN_KEYWORD_DO: + case PM_TOKEN_KEYWORD_DO_BLOCK: case PM_TOKEN_KEYWORD_DO_LOOP: case PM_TOKEN_KEYWORD_END: case PM_TOKEN_KEYWORD_ELSE: @@ -13323,14 +12829,89 @@ token_begins_expression_p(pm_token_type_t type) { * prefixed by the * operator. */ static pm_node_t * -parse_starred_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) { +parse_starred_expression(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { if (accept1(parser, PM_TOKEN_USTAR)) { pm_token_t operator = parser->previous; - pm_node_t *expression = parse_value_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_splat_node_create(parser, &operator, expression); + pm_node_t *expression = parse_value_expression(parser, binding_power, (uint8_t) (flags & PM_PARSE_ACCEPTS_DO_BLOCK), PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + return UP(pm_splat_node_create(parser, &operator, expression)); } - return parse_value_expression(parser, binding_power, accepts_command_call, false, diag_id, depth); + return parse_value_expression(parser, binding_power, flags, diag_id, depth); +} + +static bool +pm_node_unreference_each(const pm_node_t *node, void *data) { + switch (PM_NODE_TYPE(node)) { + /* When we are about to destroy a set of nodes that could potentially + * contain block exits for the current scope, we need to check if they + * are contained in the list of block exits and remove them if they are. + */ + case PM_BREAK_NODE: + case PM_NEXT_NODE: + case PM_REDO_NODE: { + pm_parser_t *parser = (pm_parser_t *) data; + size_t index = 0; + + while (index < parser->current_block_exits->size) { + pm_node_t *block_exit = parser->current_block_exits->nodes[index]; + + if (block_exit == node) { + if (index + 1 < parser->current_block_exits->size) { + memmove( + &parser->current_block_exits->nodes[index], + &parser->current_block_exits->nodes[index + 1], + (parser->current_block_exits->size - index - 1) * sizeof(pm_node_t *) + ); + } + parser->current_block_exits->size--; + + /* Note returning true here because these nodes could have + * arguments that are themselves block exits. */ + return true; + } + + index++; + } + + return true; + } + /* When an implicit local variable is written to or targeted, it becomes + * a regular, named local variable. This branch removes it from the list + * of implicit parameters when that happens. */ + case PM_LOCAL_VARIABLE_READ_NODE: + case PM_IT_LOCAL_VARIABLE_READ_NODE: { + pm_parser_t *parser = (pm_parser_t *) data; + pm_node_list_t *implicit_parameters = &parser->current_scope->implicit_parameters; + + for (size_t index = 0; index < implicit_parameters->size; index++) { + if (implicit_parameters->nodes[index] == node) { + /* If the node is not the last one in the list, we need to + * shift the remaining nodes down to fill the gap. This is + * extremely unlikely to happen. */ + if (index != implicit_parameters->size - 1) { + memmove(&implicit_parameters->nodes[index], &implicit_parameters->nodes[index + 1], (implicit_parameters->size - index - 1) * sizeof(pm_node_t *)); + } + + implicit_parameters->size--; + break; + } + } + + return false; + } + default: + return true; + } +} + +/** + * When we are about to destroy a set of nodes that could potentially be + * referenced by one or more lists on the parser, then remove them from those + * lists so we don't get a use-after-free. + */ +static void +pm_node_unreference(pm_parser_t *parser, const pm_node_t *node) { + pm_visit_node(node, pm_node_unreference_each, parser); } /** @@ -13345,16 +12926,12 @@ parse_write_name(pm_parser_t *parser, pm_constant_id_t *name_field) { // append an =. pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, *name_field); size_t length = constant->length; - uint8_t *name = xcalloc(length + 1, sizeof(uint8_t)); - if (name == NULL) return; + uint8_t *name = (uint8_t *) pm_arena_alloc(parser->arena, length + 1, 1); memcpy(name, constant->start, length); name[length] = '='; - // Now switch the name to the new string. - // This silences clang analyzer warning about leak of memory pointed by `name`. - // NOLINTNEXTLINE(clang-analyzer-*) - *name_field = pm_constant_pool_insert_owned(&parser->constant_pool, name, length + 1); + *name_field = pm_constant_pool_insert_owned(&parser->metadata_arena, &parser->constant_pool, name, length + 1); } /** @@ -13376,35 +12953,10 @@ parse_unwriteable_target(pm_parser_t *parser, pm_node_t *target) { default: break; } - pm_constant_id_t name = pm_parser_constant_id_location(parser, target->location.start, target->location.end); + pm_constant_id_t name = pm_parser_constant_id_raw(parser, parser->start + PM_NODE_START(target), parser->start + PM_NODE_END(target)); pm_local_variable_target_node_t *result = pm_local_variable_target_node_create(parser, &target->location, name, 0); - pm_node_destroy(parser, target); - return (pm_node_t *) result; -} - -/** - * When an implicit local variable is written to or targeted, it becomes a - * regular, named local variable. This function removes it from the list of - * implicit parameters when that happens. - */ -static void -parse_target_implicit_parameter(pm_parser_t *parser, pm_node_t *node) { - pm_node_list_t *implicit_parameters = &parser->current_scope->implicit_parameters; - - for (size_t index = 0; index < implicit_parameters->size; index++) { - if (implicit_parameters->nodes[index] == node) { - // If the node is not the last one in the list, we need to shift the - // remaining nodes down to fill the gap. This is extremely unlikely - // to happen. - if (index != implicit_parameters->size - 1) { - memcpy(&implicit_parameters->nodes[index], &implicit_parameters->nodes[index + 1], (implicit_parameters->size - index - 1) * sizeof(pm_node_t *)); - } - - implicit_parameters->size--; - break; - } - } + return UP(result); } /** @@ -13418,7 +12970,7 @@ parse_target_implicit_parameter(pm_parser_t *parser, pm_node_t *node) { static pm_node_t * parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_parent) { switch (PM_NODE_TYPE(target)) { - case PM_MISSING_NODE: + case PM_ERROR_RECOVERY_NODE: return target; case PM_SOURCE_ENCODING_NODE: case PM_FALSE_NODE: @@ -13456,15 +13008,15 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p case PM_BACK_REFERENCE_READ_NODE: case PM_NUMBERED_REFERENCE_READ_NODE: PM_PARSER_ERR_NODE_FORMAT_CONTENT(parser, target, PM_ERR_WRITE_TARGET_READONLY); - return target; + return UP(pm_error_recovery_node_create_unexpected(parser, target)); case PM_GLOBAL_VARIABLE_READ_NODE: assert(sizeof(pm_global_variable_target_node_t) == sizeof(pm_global_variable_read_node_t)); target->type = PM_GLOBAL_VARIABLE_TARGET_NODE; return target; case PM_LOCAL_VARIABLE_READ_NODE: { - if (pm_token_is_numbered_parameter(target->location.start, target->location.end)) { - PM_PARSER_ERR_FORMAT(parser, target->location.start, target->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, target->location.start); - parse_target_implicit_parameter(parser, target); + if (pm_token_is_numbered_parameter(parser, PM_NODE_START(target), PM_NODE_LENGTH(target))) { + PM_PARSER_ERR_FORMAT(parser, PM_NODE_START(target), PM_NODE_LENGTH(target), PM_ERR_PARAMETER_NUMBERED_RESERVED, parser->start + PM_NODE_START(target)); + pm_node_unreference(parser, target); } const pm_local_variable_read_node_t *cast = (const pm_local_variable_read_node_t *) target; @@ -13479,10 +13031,9 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p } case PM_IT_LOCAL_VARIABLE_READ_NODE: { pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); - pm_node_t *node = (pm_node_t *) pm_local_variable_target_node_create(parser, &target->location, name, 0); + pm_node_t *node = UP(pm_local_variable_target_node_create(parser, &target->location, name, 0)); - parse_target_implicit_parameter(parser, target); - pm_node_destroy(parser, target); + pm_node_unreference(parser, target); return node; } @@ -13505,7 +13056,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p splat->expression = parse_target(parser, splat->expression, multiple, true); } - return (pm_node_t *) splat; + return UP(splat); } case PM_CALL_NODE: { pm_call_node_t *call = (pm_call_node_t *) target; @@ -13514,10 +13065,10 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p // target then this is either a method call or a local variable // write. if ( - (call->message_loc.start != NULL) && - (call->message_loc.end[-1] != '!') && - (call->message_loc.end[-1] != '?') && - (call->opening_loc.start == NULL) && + (call->message_loc.length > 0) && + (parser->start[call->message_loc.start + call->message_loc.length - 1] != '!') && + (parser->start[call->message_loc.start + call->message_loc.length - 1] != '?') && + (call->opening_loc.length == 0) && (call->arguments == NULL) && (call->block == NULL) ) { @@ -13531,21 +13082,19 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p // When it was parsed in the prefix position, foo was seen as a // method call with no receiver and no arguments. Now we have an // =, so we know it's a local variable write. - const pm_location_t message_loc = call->message_loc; - - pm_constant_id_t name = pm_parser_local_add_location(parser, message_loc.start, message_loc.end, 0); - pm_node_destroy(parser, target); + pm_location_t message_loc = call->message_loc; + pm_constant_id_t name = pm_parser_local_add_location(parser, &message_loc, 0); - return (pm_node_t *) pm_local_variable_target_node_create(parser, &message_loc, name, 0); + return UP(pm_local_variable_target_node_create(parser, &message_loc, name, 0)); } - if (*call->message_loc.start == '_' || parser->encoding->alnum_char(call->message_loc.start, call->message_loc.end - call->message_loc.start)) { + if (peek_at(parser, parser->start + call->message_loc.start) == '_' || parser->encoding->alnum_char(parser->start + call->message_loc.start, (ptrdiff_t) call->message_loc.length)) { if (multiple && PM_NODE_FLAG_P(call, PM_CALL_NODE_FLAGS_SAFE_NAVIGATION)) { pm_parser_err_node(parser, (const pm_node_t *) call, PM_ERR_UNEXPECTED_SAFE_NAVIGATION); } parse_write_name(parser, &call->name); - return (pm_node_t *) pm_call_target_node_create(parser, call); + return UP(pm_call_target_node_create(parser, call)); } } @@ -13553,7 +13102,7 @@ parse_target(pm_parser_t *parser, pm_node_t *target, bool multiple, bool splat_p // an aref expression, and we can transform it into an aset // expression. if (PM_NODE_FLAG_P(call, PM_CALL_NODE_FLAGS_INDEX)) { - return (pm_node_t *) pm_index_target_node_create(parser, call); + return UP(pm_index_target_node_create(parser, call)); } } PRISM_FALLTHROUGH @@ -13596,7 +13145,7 @@ parse_shareable_constant_write(pm_parser_t *parser, pm_node_t *write) { pm_shareable_constant_value_t shareable_constant = pm_parser_scope_shareable_constant_get(parser); if (shareable_constant != PM_SCOPE_SHAREABLE_CONSTANT_NONE) { - return (pm_node_t *) pm_shareable_constant_node_create(parser, write, shareable_constant); + return UP(pm_shareable_constant_node_create(parser, write, shareable_constant)); } return write; @@ -13608,16 +13157,14 @@ parse_shareable_constant_write(pm_parser_t *parser, pm_node_t *write) { static pm_node_t * parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_node_t *value) { switch (PM_NODE_TYPE(target)) { - case PM_MISSING_NODE: - pm_node_destroy(parser, value); + case PM_ERROR_RECOVERY_NODE: return target; case PM_CLASS_VARIABLE_READ_NODE: { pm_class_variable_write_node_t *node = pm_class_variable_write_node_create(parser, (pm_class_variable_read_node_t *) target, operator, value); - pm_node_destroy(parser, target); - return (pm_node_t *) node; + return UP(node); } case PM_CONSTANT_PATH_NODE: { - pm_node_t *node = (pm_node_t *) pm_constant_path_write_node_create(parser, (pm_constant_path_node_t *) target, operator, value); + pm_node_t *node = UP(pm_constant_path_write_node_create(parser, (pm_constant_path_node_t *) target, operator, value)); if (context_def_p(parser)) { pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_IN_METHOD); @@ -13626,13 +13173,12 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod return parse_shareable_constant_write(parser, node); } case PM_CONSTANT_READ_NODE: { - pm_node_t *node = (pm_node_t *) pm_constant_write_node_create(parser, (pm_constant_read_node_t *) target, operator, value); + pm_node_t *node = UP(pm_constant_write_node_create(parser, (pm_constant_read_node_t *) target, operator, value)); if (context_def_p(parser)) { pm_parser_err_node(parser, node, PM_ERR_WRITE_TARGET_IN_METHOD); } - pm_node_destroy(parser, target); return parse_shareable_constant_write(parser, node); } case PM_BACK_REFERENCE_READ_NODE: @@ -13641,45 +13187,40 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod PRISM_FALLTHROUGH case PM_GLOBAL_VARIABLE_READ_NODE: { pm_global_variable_write_node_t *node = pm_global_variable_write_node_create(parser, target, operator, value); - pm_node_destroy(parser, target); - return (pm_node_t *) node; + return UP(node); } case PM_LOCAL_VARIABLE_READ_NODE: { pm_local_variable_read_node_t *local_read = (pm_local_variable_read_node_t *) target; + pm_location_t location = target->location; pm_constant_id_t name = local_read->name; - pm_location_t name_loc = target->location; - uint32_t depth = local_read->depth; pm_scope_t *scope = pm_parser_scope_find(parser, depth); - if (pm_token_is_numbered_parameter(target->location.start, target->location.end)) { + if (pm_token_is_numbered_parameter(parser, PM_NODE_START(target), PM_NODE_LENGTH(target))) { pm_diagnostic_id_t diag_id = (scope->parameters & PM_SCOPE_PARAMETERS_NUMBERED_FOUND) ? PM_ERR_EXPRESSION_NOT_WRITABLE_NUMBERED : PM_ERR_PARAMETER_NUMBERED_RESERVED; - PM_PARSER_ERR_FORMAT(parser, target->location.start, target->location.end, diag_id, target->location.start); - parse_target_implicit_parameter(parser, target); + PM_PARSER_ERR_FORMAT(parser, PM_NODE_START(target), PM_NODE_LENGTH(target), diag_id, parser->start + PM_NODE_START(target)); + pm_node_unreference(parser, target); } pm_locals_unread(&scope->locals, name); - pm_node_destroy(parser, target); - return (pm_node_t *) pm_local_variable_write_node_create(parser, name, depth, value, &name_loc, operator); + return UP(pm_local_variable_write_node_create(parser, name, depth, value, &location, operator)); } case PM_IT_LOCAL_VARIABLE_READ_NODE: { pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); - pm_node_t *node = (pm_node_t *) pm_local_variable_write_node_create(parser, name, 0, value, &target->location, operator); + pm_node_t *node = UP(pm_local_variable_write_node_create(parser, name, 0, value, &target->location, operator)); - parse_target_implicit_parameter(parser, target); - pm_node_destroy(parser, target); + pm_node_unreference(parser, target); return node; } case PM_INSTANCE_VARIABLE_READ_NODE: { - pm_node_t *write_node = (pm_node_t *) pm_instance_variable_write_node_create(parser, (pm_instance_variable_read_node_t *) target, operator, value); - pm_node_destroy(parser, target); + pm_node_t *write_node = UP(pm_instance_variable_write_node_create(parser, (pm_instance_variable_read_node_t *) target, operator, value)); return write_node; } case PM_MULTI_TARGET_NODE: - return (pm_node_t *) pm_multi_write_node_create(parser, (pm_multi_target_node_t *) target, operator, value); + return UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) target, operator, value)); case PM_SPLAT_NODE: { pm_splat_node_t *splat = (pm_splat_node_t *) target; @@ -13688,9 +13229,9 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod } pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); - pm_multi_target_node_targets_append(parser, multi_target, (pm_node_t *) splat); + pm_multi_target_node_targets_append(parser, multi_target, UP(splat)); - return (pm_node_t *) pm_multi_write_node_create(parser, multi_target, operator, value); + return UP(pm_multi_write_node_create(parser, multi_target, operator, value)); } case PM_CALL_NODE: { pm_call_node_t *call = (pm_call_node_t *) target; @@ -13699,10 +13240,10 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod // target then this is either a method call or a local variable // write. if ( - (call->message_loc.start != NULL) && - (call->message_loc.end[-1] != '!') && - (call->message_loc.end[-1] != '?') && - (call->opening_loc.start == NULL) && + (call->message_loc.length > 0) && + (parser->start[call->message_loc.start + call->message_loc.length - 1] != '!') && + (parser->start[call->message_loc.start + call->message_loc.length - 1] != '?') && + (call->opening_loc.length == 0) && (call->arguments == NULL) && (call->block == NULL) ) { @@ -13716,19 +13257,18 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod // When it was parsed in the prefix position, foo was seen as a // method call with no receiver and no arguments. Now we have an // =, so we know it's a local variable write. - const pm_location_t message = call->message_loc; + pm_location_t message_loc = call->message_loc; - pm_parser_local_add_location(parser, message.start, message.end, 0); - pm_node_destroy(parser, target); + pm_refute_numbered_parameter(parser, message_loc.start, message_loc.length); + pm_parser_local_add_location(parser, &message_loc, 0); - pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, message.start, message.end); - target = (pm_node_t *) pm_local_variable_write_node_create(parser, constant_id, 0, value, &message, operator); + pm_constant_id_t constant_id = pm_parser_constant_id_raw(parser, parser->start + PM_LOCATION_START(&message_loc), parser->start + PM_LOCATION_END(&message_loc)); + target = UP(pm_local_variable_write_node_create(parser, constant_id, 0, value, &message_loc, operator)); - pm_refute_numbered_parameter(parser, message.start, message.end); return target; } - if (char_is_identifier_start(parser, call->message_loc.start, parser->end - call->message_loc.start)) { + if (char_is_identifier_start(parser, parser->start + call->message_loc.start, (ptrdiff_t) call->message_loc.length)) { // When we get here, we have a method call, because it was // previously marked as a method call but now we have an =. This // looks like: @@ -13742,13 +13282,14 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod pm_arguments_node_t *arguments = pm_arguments_node_create(parser); call->arguments = arguments; - pm_arguments_node_arguments_append(arguments, value); - call->base.location.end = arguments->base.location.end; + pm_arguments_node_arguments_append(parser->arena, arguments, value); + PM_NODE_LENGTH_SET_NODE(call, arguments); + call->equal_loc = TOK2LOC(parser, operator); parse_write_name(parser, &call->name); - pm_node_flag_set((pm_node_t *) call, PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY)); + pm_node_flag_set(UP(call), PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY)); - return (pm_node_t *) call; + return UP(call); } } @@ -13760,25 +13301,31 @@ parse_write(pm_parser_t *parser, pm_node_t *target, pm_token_t *operator, pm_nod call->arguments = pm_arguments_node_create(parser); } - pm_arguments_node_arguments_append(call->arguments, value); - target->location.end = value->location.end; + pm_arguments_node_arguments_append(parser->arena, call->arguments, value); + PM_NODE_LENGTH_SET_NODE(target, value); // Replace the name with "[]=". call->name = pm_parser_constant_id_constant(parser, "[]=", 3); + call->equal_loc = TOK2LOC(parser, operator); // Ensure that the arguments for []= don't contain keywords pm_index_arguments_check(parser, call->arguments, call->block); - pm_node_flag_set((pm_node_t *) call, PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY)); + pm_node_flag_set(UP(call), PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE | pm_implicit_array_write_flags(value, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY)); return target; } - // If there are arguments on the call node, then it can't be a method - // call ending with = or a local variable write, so it must be a - // syntax error. In this case we'll fall through to our default + // If there are arguments on the call node, then it can't be a + // method call ending with = or a local variable write, so it must + // be a syntax error. In this case we'll fall through to our default // handling. We need to free the value that we parsed because there // is no way for us to attach it to the tree at this point. - pm_node_destroy(parser, value); + // + // Since it is possible for the value to contain an implicit + // parameter somewhere in its subtree, we need to walk it and remove + // any implicit parameters from the list of implicit parameters for + // the current scope. + pm_node_unreference(parser, value); } PRISM_FALLTHROUGH default: @@ -13809,11 +13356,10 @@ parse_unwriteable_write(pm_parser_t *parser, pm_node_t *target, const pm_token_t default: break; } - pm_constant_id_t name = pm_parser_local_add_location(parser, target->location.start, target->location.end, 1); + pm_constant_id_t name = pm_parser_local_add_location(parser, &target->location, 1); pm_local_variable_write_node_t *result = pm_local_variable_write_node_create(parser, name, 0, value, &target->location, equals); - pm_node_destroy(parser, target); - return (pm_node_t *) result; + return UP(result); } /** @@ -13846,35 +13392,35 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b pm_node_t *name = NULL; if (token_begins_expression_p(parser->current.type)) { - name = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + name = parse_expression(parser, binding_power, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); name = parse_target(parser, name, true, true); } - pm_node_t *splat = (pm_node_t *) pm_splat_node_create(parser, &star_operator, name); + pm_node_t *splat = UP(pm_splat_node_create(parser, &star_operator, name)); pm_multi_target_node_targets_append(parser, result, splat); has_rest = true; } else if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { context_push(parser, PM_CONTEXT_MULTI_TARGET); - pm_node_t *target = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); + pm_node_t *target = parse_expression(parser, binding_power, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); target = parse_target(parser, target, true, false); pm_multi_target_node_targets_append(parser, result, target); context_pop(parser); } else if (token_begins_expression_p(parser->current.type)) { - pm_node_t *target = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); + pm_node_t *target = parse_expression(parser, binding_power, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); target = parse_target(parser, target, true, false); pm_multi_target_node_targets_append(parser, result, target); } else if (!match1(parser, PM_TOKEN_EOF)) { // If we get here, then we have a trailing , in a multi target node. // We'll add an implicit rest node to represent this. - pm_node_t *rest = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous); + pm_node_t *rest = UP(pm_implicit_rest_node_create(parser, &parser->previous)); pm_multi_target_node_targets_append(parser, result, rest); break; } } - return (pm_node_t *) result; + return UP(result); } /** @@ -13884,7 +13430,13 @@ parse_targets(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t b static pm_node_t * parse_targets_validate(pm_parser_t *parser, pm_node_t *first_target, pm_binding_power_t binding_power, uint16_t depth) { pm_node_t *result = parse_targets(parser, first_target, binding_power, depth); - accept1(parser, PM_TOKEN_NEWLINE); + + // If we're inside parentheses, then we allow a newline before the + // closing parenthesis or equals sign. Outside of parentheses, a newline + // is not allowed (e.g., `a, b\n= 1, 2` is not valid). + if (context_p(parser, PM_CONTEXT_PARENS) || context_p(parser, PM_CONTEXT_MULTI_TARGET)) { + accept1(parser, PM_TOKEN_NEWLINE); + } // Ensure that we have either an = or a ) after the targets. if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { @@ -13913,7 +13465,7 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) { context_push(parser, context); while (true) { - pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); pm_statements_node_body_append(parser, statements, node, true); // If we're recovering from a syntax error, then we need to stop parsing @@ -13953,7 +13505,7 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) { // we were unable to parse an expression, then we will skip past this // token and continue parsing the statements list. Otherwise we'll add // an error and continue parsing the statements list. - if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) { + if (PM_NODE_TYPE_P(node, PM_ERROR_RECOVERY_NODE)) { parser_lex(parser); // If we are at the end of the file, then we need to stop parsing @@ -13971,13 +13523,14 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) { // This is an inlined version of accept1 because the error that we // want to add has varargs. If this happens again, we should // probably extract a helper function. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } } context_pop(parser); + bool last_value = true; switch (context) { case PM_CONTEXT_BEGIN_ENSURE: @@ -13998,23 +13551,24 @@ parse_statements(pm_parser_t *parser, pm_context_t context, uint16_t depth) { */ static void pm_hash_key_static_literals_add(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *node) { - const pm_node_t *duplicated = pm_static_literals_add(&parser->newline_list, parser->start_line, literals, node, true); + const pm_node_t *duplicated = pm_static_literals_add(&parser->line_offsets, parser->start, parser->start_line, literals, node, true); if (duplicated != NULL) { pm_buffer_t buffer = { 0 }; - pm_static_literal_inspect(&buffer, &parser->newline_list, parser->start_line, parser->encoding->name, duplicated); + pm_static_literal_inspect(&buffer, &parser->line_offsets, parser->start, parser->start_line, parser->encoding->name, duplicated); pm_diagnostic_list_append_format( + &parser->metadata_arena, &parser->warning_list, duplicated->location.start, - duplicated->location.end, + duplicated->location.length, PM_WARN_DUPLICATED_HASH_KEY, (int) pm_buffer_length(&buffer), pm_buffer_value(&buffer), - pm_newline_list_line_column(&parser->newline_list, node->location.start, parser->start_line).line + pm_line_offset_list_line_column(&parser->line_offsets, PM_NODE_START(node), parser->start_line).line ); - pm_buffer_free(&buffer); + pm_buffer_cleanup(&buffer); } } @@ -14026,14 +13580,15 @@ static void pm_when_clause_static_literals_add(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *node) { pm_node_t *previous; - if ((previous = pm_static_literals_add(&parser->newline_list, parser->start_line, literals, node, false)) != NULL) { + if ((previous = pm_static_literals_add(&parser->line_offsets, parser->start, parser->start_line, literals, node, false)) != NULL) { pm_diagnostic_list_append_format( + &parser->metadata_arena, &parser->warning_list, - node->location.start, - node->location.end, + PM_NODE_START(node), + PM_NODE_LENGTH(node), PM_WARN_DUPLICATED_WHEN_CLAUSE, - pm_newline_list_line_column(&parser->newline_list, node->location.start, parser->start_line).line, - pm_newline_list_line_column(&parser->newline_list, previous->location.start, parser->start_line).line + pm_line_offset_list_line_column(&parser->line_offsets, PM_NODE_START(node), parser->start_line).line, + pm_line_offset_list_line_column(&parser->line_offsets, PM_NODE_START(previous), parser->start_line).line ); } } @@ -14061,14 +13616,14 @@ parse_assocs(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *nod // inner hash to share the static literals with the outer // hash. parser->current_hash_keys = literals; - value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); } else if (token_begins_expression_p(parser->current.type)) { - value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT_HASH, (uint16_t) (depth + 1)); } else { pm_parser_scope_forwarding_keywords_check(parser, &operator); } - element = (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator); + element = UP(pm_assoc_splat_node_create(parser, value, &operator)); contains_keyword_splat = true; break; } @@ -14076,44 +13631,43 @@ parse_assocs(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *nod pm_token_t label = parser->current; parser_lex(parser); - pm_node_t *key = (pm_node_t *) pm_symbol_node_label_create(parser, &label); + pm_node_t *key = UP(pm_symbol_node_label_create(parser, &label)); pm_hash_key_static_literals_add(parser, literals, key); - pm_token_t operator = not_provided(parser); pm_node_t *value = NULL; if (token_begins_expression_p(parser->current.type)) { - value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_EXPRESSION_AFTER_LABEL, (uint16_t) (depth + 1)); + value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_HASH_EXPRESSION_AFTER_LABEL, (uint16_t) (depth + 1)); } else { if (parser->encoding->isupper_char(label.start, (label.end - 1) - label.start)) { pm_token_t constant = { .type = PM_TOKEN_CONSTANT, .start = label.start, .end = label.end - 1 }; - value = (pm_node_t *) pm_constant_read_node_create(parser, &constant); + value = UP(pm_constant_read_node_create(parser, &constant)); } else { int depth = -1; pm_token_t identifier = { .type = PM_TOKEN_IDENTIFIER, .start = label.start, .end = label.end - 1 }; if (identifier.end[-1] == '!' || identifier.end[-1] == '?') { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, identifier, PM_ERR_INVALID_LOCAL_VARIABLE_READ); + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &identifier, PM_ERR_INVALID_LOCAL_VARIABLE_READ); } else { depth = pm_parser_local_depth(parser, &identifier); } if (depth == -1) { - value = (pm_node_t *) pm_call_node_variable_call_create(parser, &identifier); + value = UP(pm_call_node_variable_call_create(parser, &identifier)); } else { - value = (pm_node_t *) pm_local_variable_read_node_create(parser, &identifier, (uint32_t) depth); + value = UP(pm_local_variable_read_node_create(parser, &identifier, (uint32_t) depth)); } } - value->location.end++; - value = (pm_node_t *) pm_implicit_node_create(parser, value); + value->location.length++; + value = UP(pm_implicit_node_create(parser, value)); } - element = (pm_node_t *) pm_assoc_node_create(parser, key, &operator, value); + element = UP(pm_assoc_node_create(parser, key, NULL, value)); break; } default: { - pm_node_t *key = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, true, PM_ERR_HASH_KEY, (uint16_t) (depth + 1)); + pm_node_t *key = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, PM_PARSE_ACCEPTS_DO_BLOCK | PM_PARSE_ACCEPTS_LABEL, PM_ERR_HASH_KEY, (uint16_t) (depth + 1)); // Hash keys that are strings are automatically frozen. We will // mark that here. @@ -14123,24 +13677,22 @@ parse_assocs(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *nod pm_hash_key_static_literals_add(parser, literals, key); - pm_token_t operator; - if (pm_symbol_node_label_p(key)) { - operator = not_provided(parser); - } else { + pm_token_t operator = { 0 }; + if (!pm_symbol_node_label_p(parser, key)) { expect1(parser, PM_TOKEN_EQUAL_GREATER, PM_ERR_HASH_ROCKET); operator = parser->previous; } - pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); - element = (pm_node_t *) pm_assoc_node_create(parser, key, &operator, value); + pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + element = UP(pm_assoc_node_create(parser, key, NTOK2PTR(operator), value)); break; } } if (PM_NODE_TYPE_P(node, PM_HASH_NODE)) { - pm_hash_node_elements_append((pm_hash_node_t *) node, element); + pm_hash_node_elements_append(parser->arena, (pm_hash_node_t *) node, element); } else { - pm_keyword_hash_node_elements_append((pm_keyword_hash_node_t *) node, element); + pm_keyword_hash_node_elements_append(parser->arena, (pm_keyword_hash_node_t *) node, element); } // If there's no comma after the element, then we're done. @@ -14161,23 +13713,47 @@ parse_assocs(pm_parser_t *parser, pm_static_literals_t *literals, pm_node_t *nod return contains_keyword_splat; } +static PRISM_INLINE bool +argument_allowed_for_bare_hash(pm_parser_t *parser, pm_node_t *argument) { + if (pm_symbol_node_label_p(parser, argument)) { + return true; + } + + switch (PM_NODE_TYPE(argument)) { + case PM_CALL_NODE: { + pm_call_node_t *cast = (pm_call_node_t *) argument; + if (cast->opening_loc.length == 0 && cast->arguments != NULL) { + if (PM_NODE_FLAG_P(cast->arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS | PM_ARGUMENTS_NODE_FLAGS_CONTAINS_SPLAT)) { + return false; + } + if (cast->block != NULL) { + return false; + } + } + break; + } + default: break; + } + return accept1(parser, PM_TOKEN_EQUAL_GREATER); +} + /** * Append an argument to a list of arguments. */ -static inline void +static PRISM_INLINE void parse_arguments_append(pm_parser_t *parser, pm_arguments_t *arguments, pm_node_t *argument) { if (arguments->arguments == NULL) { arguments->arguments = pm_arguments_node_create(parser); } - pm_arguments_node_arguments_append(arguments->arguments, argument); + pm_arguments_node_arguments_append(parser->arena, arguments->arguments, argument); } /** * Parse a list of arguments. */ static void -parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_forwarding, pm_token_type_t terminator, uint16_t depth) { +parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_forwarding, pm_token_type_t terminator, uint8_t flags, uint16_t depth) { pm_binding_power_t binding_power = pm_binding_powers[parser->current.type].left; // First we need to check if the next token is one that could be the start @@ -14210,16 +13786,16 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for } pm_keyword_hash_node_t *hash = pm_keyword_hash_node_create(parser); - argument = (pm_node_t *) hash; + argument = UP(hash); pm_static_literals_t hash_keys = { 0 }; - bool contains_keyword_splat = parse_assocs(parser, &hash_keys, (pm_node_t *) hash, (uint16_t) (depth + 1)); + bool contains_keyword_splat = parse_assocs(parser, &hash_keys, UP(hash), (uint16_t) (depth + 1)); parse_arguments_append(parser, arguments, argument); - pm_node_flags_t flags = PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; - if (contains_keyword_splat) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; - pm_node_flag_set((pm_node_t *) arguments->arguments, flags); + pm_node_flags_t node_flags = PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; + if (contains_keyword_splat) node_flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; + pm_node_flag_set(UP(arguments->arguments), node_flags); pm_static_literals_free(&hash_keys); parsed_bare_hash = true; @@ -14232,12 +13808,12 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for pm_node_t *expression = NULL; if (token_begins_expression_p(parser->current.type)) { - expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); } else { pm_parser_scope_forwarding_block_check(parser, &operator); } - argument = (pm_node_t *) pm_block_argument_node_create(parser, &operator, expression); + argument = UP(pm_block_argument_node_create(parser, &operator, expression)); if (parsed_block_argument) { parse_arguments_append(parser, arguments, argument); } else { @@ -14257,18 +13833,18 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for if (match4(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_SEMICOLON, PM_TOKEN_BRACKET_RIGHT)) { pm_parser_scope_forwarding_positionals_check(parser, &operator); - argument = (pm_node_t *) pm_splat_node_create(parser, &operator, NULL); + argument = UP(pm_splat_node_create(parser, &operator, NULL)); if (parsed_bare_hash) { pm_parser_err_previous(parser, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT); } } else { - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT, (uint16_t) (depth + 1)); + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_SPLAT, (uint16_t) (depth + 1)); if (parsed_bare_hash) { - pm_parser_err(parser, operator.start, expression->location.end, PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT); + pm_parser_err(parser, PM_TOKEN_START(parser, &operator), PM_NODE_END(expression) - PM_TOKEN_START(parser, &operator), PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT); } - argument = (pm_node_t *) pm_splat_node_create(parser, &operator, expression); + argument = UP(pm_splat_node_create(parser, &operator, expression)); } parse_arguments_append(parser, arguments, argument); @@ -14283,26 +13859,26 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for // not actually argument forwarding but was instead a // range. pm_token_t operator = parser->previous; - pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_RANGE, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_RANGE, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); // If we parse a range, we need to validate that we // didn't accidentally violate the nonassoc rules of the // ... operator. if (PM_NODE_TYPE_P(right, PM_RANGE_NODE)) { pm_range_node_t *range = (pm_range_node_t *) right; - pm_parser_err(parser, range->operator_loc.start, range->operator_loc.end, PM_ERR_UNEXPECTED_RANGE_OPERATOR); + pm_parser_err(parser, range->operator_loc.start, range->operator_loc.length, PM_ERR_UNEXPECTED_RANGE_OPERATOR); } - argument = (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right); + argument = UP(pm_range_node_create(parser, NULL, &operator, right)); } else { pm_parser_scope_forwarding_all_check(parser, &parser->previous); if (parsed_first_argument && terminator == PM_TOKEN_EOF) { pm_parser_err_previous(parser, PM_ERR_ARGUMENT_FORWARDING_UNBOUND); } - argument = (pm_node_t *) pm_forwarding_arguments_node_create(parser, &parser->previous); + argument = UP(pm_forwarding_arguments_node_create(parser, &parser->previous)); parse_arguments_append(parser, arguments, argument); - pm_node_flag_set((pm_node_t *) arguments->arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_FORWARDING); + pm_node_flag_set(UP(arguments->arguments), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_FORWARDING); arguments->has_forwarding = true; parsed_forwarding_arguments = true; break; @@ -14312,22 +13888,20 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for PRISM_FALLTHROUGH default: { if (argument == NULL) { - argument = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, !parsed_first_argument, true, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + argument = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (!parsed_first_argument ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0u) | PM_PARSE_ACCEPTS_LABEL), PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); } bool contains_keywords = false; bool contains_keyword_splat = false; - if (pm_symbol_node_label_p(argument) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) { + if (argument_allowed_for_bare_hash(parser, argument)) { if (parsed_bare_hash) { pm_parser_err_previous(parser, PM_ERR_ARGUMENT_BARE_HASH); } - pm_token_t operator; + pm_token_t operator = { 0 }; if (parser->previous.type == PM_TOKEN_EQUAL_GREATER) { operator = parser->previous; - } else { - operator = not_provided(parser); } pm_keyword_hash_node_t *bare_hash = pm_keyword_hash_node_create(parser); @@ -14338,18 +13912,18 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for pm_hash_key_static_literals_add(parser, &hash_keys, argument); // Finish parsing the one we are part way through. - pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); - argument = (pm_node_t *) pm_assoc_node_create(parser, argument, &operator, value); + pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + argument = UP(pm_assoc_node_create(parser, argument, NTOK2PTR(operator), value)); - pm_keyword_hash_node_elements_append(bare_hash, argument); - argument = (pm_node_t *) bare_hash; + pm_keyword_hash_node_elements_append(parser->arena, bare_hash, argument); + argument = UP(bare_hash); // Then parse more if we have a comma if (accept1(parser, PM_TOKEN_COMMA) && ( token_begins_expression_p(parser->current.type) || match2(parser, PM_TOKEN_USTAR_STAR, PM_TOKEN_LABEL) )) { - contains_keyword_splat = parse_assocs(parser, &hash_keys, (pm_node_t *) bare_hash, (uint16_t) (depth + 1)); + contains_keyword_splat = parse_assocs(parser, &hash_keys, UP(bare_hash), (uint16_t) (depth + 1)); } pm_static_literals_free(&hash_keys); @@ -14358,10 +13932,10 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for parse_arguments_append(parser, arguments, argument); - pm_node_flags_t flags = 0; - if (contains_keywords) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; - if (contains_keyword_splat) flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; - pm_node_flag_set((pm_node_t *) arguments->arguments, flags); + pm_node_flags_t node_flags = 0; + if (contains_keywords) node_flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS; + if (contains_keyword_splat) node_flags |= PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORD_SPLAT; + pm_node_flag_set(UP(arguments->arguments), node_flags); break; } @@ -14370,7 +13944,7 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for parsed_first_argument = true; // If parsing the argument failed, we need to stop parsing arguments. - if (PM_NODE_TYPE_P(argument, PM_MISSING_NODE) || parser->recovering) break; + if (PM_NODE_TYPE_P(argument, PM_ERROR_RECOVERY_NODE) || parser->recovering) break; // If the terminator of these arguments is not EOF, then we have a // specific token we're looking for. In that case we can accept a @@ -14390,6 +13964,17 @@ parse_arguments(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_for if (accepted_newline) { pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); } + + // If this is a command call and an argument takes a block, + // there can be no further arguments. For example, + // `foo(bar 1 do end, 2)` should be rejected. + if (PM_NODE_TYPE_P(argument, PM_CALL_NODE)) { + pm_call_node_t *call = (pm_call_node_t *) argument; + if (call->opening_loc.length == 0 && call->arguments != NULL && call->block != NULL) { + pm_parser_err_previous(parser, PM_ERR_INVALID_COMMA); + break; + } + } } else { // If there is no comma at the end of the argument list then we're // done parsing arguments and can break out of this loop. @@ -14417,7 +14002,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) { expect1(parser, PM_TOKEN_PARENTHESIS_LEFT, PM_ERR_EXPECT_LPAREN_REQ_PARAMETER); pm_multi_target_node_t *node = pm_multi_target_node_create(parser); - pm_multi_target_node_opening_set(node, &parser->previous); + pm_multi_target_node_opening_set(parser, node, &parser->previous); do { pm_node_t *param; @@ -14427,33 +14012,33 @@ parse_required_destructured_parameter(pm_parser_t *parser) { // commas, so here we'll assume this is a mistake of the user not // knowing it's not allowed here. if (node->lefts.size > 0 && match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - param = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous); + param = UP(pm_implicit_rest_node_create(parser, &parser->previous)); pm_multi_target_node_targets_append(parser, node, param); pm_parser_err_current(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); break; } if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { - param = (pm_node_t *) parse_required_destructured_parameter(parser); + param = UP(parse_required_destructured_parameter(parser)); } else if (accept1(parser, PM_TOKEN_USTAR)) { pm_token_t star = parser->previous; pm_node_t *value = NULL; if (accept1(parser, PM_TOKEN_IDENTIFIER)) { pm_token_t name = parser->previous; - value = (pm_node_t *) pm_required_parameter_node_create(parser, &name); + value = UP(pm_required_parameter_node_create(parser, &name)); if (pm_parser_parameter_name_check(parser, &name)) { pm_node_flag_set_repeated_parameter(value); } pm_parser_local_add_token(parser, &name, 1); } - param = (pm_node_t *) pm_splat_node_create(parser, &star, value); + param = UP(pm_splat_node_create(parser, &star, value)); } else { expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_EXPECT_IDENT_REQ_PARAMETER); pm_token_t name = parser->previous; - param = (pm_node_t *) pm_required_parameter_node_create(parser, &name); + param = UP(pm_required_parameter_node_create(parser, &name)); if (pm_parser_parameter_name_check(parser, &name)) { pm_node_flag_set_repeated_parameter(param); } @@ -14465,7 +14050,7 @@ parse_required_destructured_parameter(pm_parser_t *parser) { accept1(parser, PM_TOKEN_NEWLINE); expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN_REQ_PARAMETER); - pm_multi_target_node_closing_set(node, &parser->previous); + pm_multi_target_node_closing_set(parser, node, &parser->previous); return node; } @@ -14541,6 +14126,43 @@ update_parameter_state(pm_parser_t *parser, pm_token_t *token, pm_parameters_ord return true; } +static PRISM_INLINE void +parse_parameters_handle_trailing_comma( + pm_parser_t *parser, + pm_parameters_node_t *params, + pm_parameters_order_t order, + bool in_block, + bool allows_trailing_comma +) { + if (!allows_trailing_comma) { + pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); + return; + } + + if (in_block) { + if (order >= PM_PARAMETERS_ORDER_NAMED) { + // foo do |bar,|; end + pm_node_t *param = UP(pm_implicit_rest_node_create(parser, &parser->previous)); + + if (params->rest == NULL) { + pm_parameters_node_rest_set(params, param); + } else { + pm_parser_err_node(parser, UP(param), PM_ERR_PARAMETER_SPLAT_MULTI); + pm_parameters_node_posts_append(parser->arena, params, UP(param)); + } + } else { + // foo do |*bar,|; end + pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); + } + } else { + // https://bugs.ruby-lang.org/issues/19107 + // Allow `def foo(bar,); end`, `def foo(*bar,); end`, etc. but not `def foo(...,); end` + if (parser->version < PM_OPTIONS_VERSION_CRUBY_4_1 || order == PM_PARAMETERS_ORDER_NOTHING_AFTER) { + pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); + } + } +} + /** * Parse a list of parameters on a method definition. */ @@ -14553,6 +14175,7 @@ parse_parameters( bool allows_forwarding_parameters, bool accepts_blocks_in_defaults, bool in_block, + pm_diagnostic_id_t diag_id_forwarding, uint16_t depth ) { pm_do_loop_stack_push(parser, false); @@ -14566,12 +14189,12 @@ parse_parameters( switch (parser->current.type) { case PM_TOKEN_PARENTHESIS_LEFT: { update_parameter_state(parser, &parser->current, &order); - pm_node_t *param = (pm_node_t *) parse_required_destructured_parameter(parser); + pm_node_t *param = UP(parse_required_destructured_parameter(parser)); if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) { - pm_parameters_node_requireds_append(params, param); + pm_parameters_node_requireds_append(parser->arena, params, param); } else { - pm_parameters_node_posts_append(params, param); + pm_parameters_node_posts_append(parser->arena, params, param); } break; } @@ -14581,34 +14204,40 @@ parse_parameters( parser_lex(parser); pm_token_t operator = parser->previous; - pm_token_t name; + pm_node_t *param; - bool repeated = false; - if (accept1(parser, PM_TOKEN_IDENTIFIER)) { - name = parser->previous; - repeated = pm_parser_parameter_name_check(parser, &name); - pm_parser_local_add_token(parser, &name, 1); + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1 && accept1(parser, PM_TOKEN_KEYWORD_NIL)) { + param = (pm_node_t *) pm_no_block_parameter_node_create(parser, &operator, &parser->previous); } else { - name = not_provided(parser); - parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_BLOCK; - } + pm_token_t name = {0}; - pm_block_parameter_node_t *param = pm_block_parameter_node_create(parser, &name, &operator); - if (repeated) { - pm_node_flag_set_repeated_parameter((pm_node_t *)param); + bool repeated = false; + if (accept1(parser, PM_TOKEN_IDENTIFIER)) { + name = parser->previous; + repeated = pm_parser_parameter_name_check(parser, &name); + pm_parser_local_add_token(parser, &name, 1); + } else { + parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_BLOCK; + } + + param = (pm_node_t *) pm_block_parameter_node_create(parser, NTOK2PTR(name), &operator); + if (repeated) { + pm_node_flag_set_repeated_parameter(param); + } } + if (params->block == NULL) { pm_parameters_node_block_set(params, param); } else { - pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_BLOCK_MULTI); - pm_parameters_node_posts_append(params, (pm_node_t *) param); + pm_parser_err_node(parser, param, PM_ERR_PARAMETER_BLOCK_MULTI); + pm_parameters_node_posts_append(parser->arena, params, UP(pm_error_recovery_node_create_unexpected(parser, param))); } break; } case PM_TOKEN_UDOT_DOT_DOT: { if (!allows_forwarding_parameters) { - pm_parser_err_current(parser, PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES); + pm_parser_err_current(parser, diag_id_forwarding); } bool succeeded = update_parameter_state(parser, &parser->current, &order); @@ -14621,12 +14250,12 @@ parse_parameters( // If we already have a keyword rest parameter, then we replace it with the // forwarding parameter and move the keyword rest parameter to the posts list. pm_node_t *keyword_rest = params->keyword_rest; - pm_parameters_node_posts_append(params, keyword_rest); + pm_parameters_node_posts_append(parser->arena, params, UP(pm_error_recovery_node_create_unexpected(parser, keyword_rest))); if (succeeded) pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_FWD); params->keyword_rest = NULL; } - pm_parameters_node_keyword_rest_set(params, (pm_node_t *) param); + pm_parameters_node_keyword_rest_set(params, UP(param)); break; } case PM_TOKEN_CLASS_VARIABLE: @@ -14671,24 +14300,24 @@ parse_parameters( parser_lex(parser); pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &name); - uint32_t reads = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0; + uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0; if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true); - pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT, (uint16_t) (depth + 1)); + pm_node_t *value = parse_value_expression(parser, binding_power, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_PARAMETER_NO_DEFAULT, (uint16_t) (depth + 1)); if (accepts_blocks_in_defaults) pm_accepts_block_stack_pop(parser); pm_optional_parameter_node_t *param = pm_optional_parameter_node_create(parser, &name, &operator, value); if (repeated) { - pm_node_flag_set_repeated_parameter((pm_node_t *) param); + pm_node_flag_set_repeated_parameter(UP(param)); } - pm_parameters_node_optionals_append(params, param); + pm_parameters_node_optionals_append(parser->arena, params, param); // If the value of the parameter increased the number of // reads of that parameter, then we need to warn that we // have a circular definition. - if ((parser->version == PM_OPTIONS_VERSION_CRUBY_3_3) && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, name, PM_ERR_PARAMETER_CIRCULAR); + if ((parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3) && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &name, PM_ERR_PARAMETER_CIRCULAR); } context_pop(parser); @@ -14703,15 +14332,15 @@ parse_parameters( } else if (order > PM_PARAMETERS_ORDER_AFTER_OPTIONAL) { pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name); if (repeated) { - pm_node_flag_set_repeated_parameter((pm_node_t *)param); + pm_node_flag_set_repeated_parameter(UP(param)); } - pm_parameters_node_requireds_append(params, (pm_node_t *) param); + pm_parameters_node_requireds_append(parser->arena, params, UP(param)); } else { pm_required_parameter_node_t *param = pm_required_parameter_node_create(parser, &name); if (repeated) { - pm_node_flag_set_repeated_parameter((pm_node_t *)param); + pm_node_flag_set_repeated_parameter(UP(param)); } - pm_parameters_node_posts_append(params, (pm_node_t *) param); + pm_parameters_node_posts_append(parser->arena, params, UP(param)); } break; @@ -14728,9 +14357,9 @@ parse_parameters( local.end -= 1; if (parser->encoding_changed ? parser->encoding->isupper_char(local.start, local.end - local.start) : pm_encoding_utf_8_isupper_char(local.start, local.end - local.start)) { - pm_parser_err(parser, local.start, local.end, PM_ERR_ARGUMENT_FORMAL_CONSTANT); + pm_parser_err(parser, PM_TOKEN_START(parser, &local), PM_TOKEN_LENGTH(&local), PM_ERR_ARGUMENT_FORMAL_CONSTANT); } else if (local.end[-1] == '!' || local.end[-1] == '?') { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, local, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE); + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &local, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE); } bool repeated = pm_parser_parameter_name_check(parser, &local); @@ -14742,12 +14371,12 @@ parse_parameters( case PM_TOKEN_PIPE: { context_pop(parser); - pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name); + pm_node_t *param = UP(pm_required_keyword_parameter_node_create(parser, &name)); if (repeated) { pm_node_flag_set_repeated_parameter(param); } - pm_parameters_node_keywords_append(params, param); + pm_parameters_node_keywords_append(parser->arena, params, param); break; } case PM_TOKEN_SEMICOLON: @@ -14759,12 +14388,12 @@ parse_parameters( break; } - pm_node_t *param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name); + pm_node_t *param = UP(pm_required_keyword_parameter_node_create(parser, &name)); if (repeated) { pm_node_flag_set_repeated_parameter(param); } - pm_parameters_node_keywords_append(params, param); + pm_parameters_node_keywords_append(parser->arena, params, param); break; } default: { @@ -14772,20 +14401,20 @@ parse_parameters( if (token_begins_expression_p(parser->current.type)) { pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &local); - uint32_t reads = parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0; + uint32_t reads = parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 ? pm_locals_reads(&parser->current_scope->locals, name_id) : 0; if (accepts_blocks_in_defaults) pm_accepts_block_stack_push(parser, true); - pm_node_t *value = parse_value_expression(parser, binding_power, false, false, PM_ERR_PARAMETER_NO_DEFAULT_KW, (uint16_t) (depth + 1)); + pm_node_t *value = parse_value_expression(parser, binding_power, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_PARAMETER_NO_DEFAULT_KW, (uint16_t) (depth + 1)); if (accepts_blocks_in_defaults) pm_accepts_block_stack_pop(parser); - if (parser->version == PM_OPTIONS_VERSION_CRUBY_3_3 && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, local, PM_ERR_PARAMETER_CIRCULAR); + if (parser->version <= PM_OPTIONS_VERSION_CRUBY_3_3 && (pm_locals_reads(&parser->current_scope->locals, name_id) != reads)) { + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &local, PM_ERR_PARAMETER_CIRCULAR); } - param = (pm_node_t *) pm_optional_keyword_parameter_node_create(parser, &name, value); + param = UP(pm_optional_keyword_parameter_node_create(parser, &name, value)); } else { - param = (pm_node_t *) pm_required_keyword_parameter_node_create(parser, &name); + param = UP(pm_required_keyword_parameter_node_create(parser, &name)); } if (repeated) { @@ -14793,7 +14422,7 @@ parse_parameters( } context_pop(parser); - pm_parameters_node_keywords_append(params, param); + pm_parameters_node_keywords_append(parser->arena, params, param); // If parsing the value of the parameter resulted in error recovery, // then we can put a missing node in its place and stop parsing the @@ -14814,7 +14443,7 @@ parse_parameters( parser_lex(parser); pm_token_t operator = parser->previous; - pm_token_t name; + pm_token_t name = { 0 }; bool repeated = false; if (accept1(parser, PM_TOKEN_IDENTIFIER)) { @@ -14822,11 +14451,10 @@ parse_parameters( repeated = pm_parser_parameter_name_check(parser, &name); pm_parser_local_add_token(parser, &name, 1); } else { - name = not_provided(parser); parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS; } - pm_node_t *param = (pm_node_t *) pm_rest_parameter_node_create(parser, &operator, &name); + pm_node_t *param = UP(pm_rest_parameter_node_create(parser, &operator, NTOK2PTR(name))); if (repeated) { pm_node_flag_set_repeated_parameter(param); } @@ -14835,7 +14463,7 @@ parse_parameters( pm_parameters_node_rest_set(params, param); } else { pm_parser_err_node(parser, param, PM_ERR_PARAMETER_SPLAT_MULTI); - pm_parameters_node_posts_append(params, param); + pm_parameters_node_posts_append(parser->arena, params, param); } break; @@ -14854,9 +14482,9 @@ parse_parameters( pm_parser_err_previous(parser, PM_ERR_PARAMETER_UNEXPECTED_NO_KW); } - param = (pm_node_t *) pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous); + param = UP(pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous)); } else { - pm_token_t name; + pm_token_t name = { 0 }; bool repeated = false; if (accept1(parser, PM_TOKEN_IDENTIFIER)) { @@ -14864,11 +14492,10 @@ parse_parameters( repeated = pm_parser_parameter_name_check(parser, &name); pm_parser_local_add_token(parser, &name, 1); } else { - name = not_provided(parser); parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS; } - param = (pm_node_t *) pm_keyword_rest_parameter_node_create(parser, &operator, &name); + param = UP(pm_keyword_rest_parameter_node_create(parser, &operator, NTOK2PTR(name))); if (repeated) { pm_node_flag_set_repeated_parameter(param); } @@ -14878,27 +14505,14 @@ parse_parameters( pm_parameters_node_keyword_rest_set(params, param); } else { pm_parser_err_node(parser, param, PM_ERR_PARAMETER_ASSOC_SPLAT_MULTI); - pm_parameters_node_posts_append(params, param); + pm_parameters_node_posts_append(parser->arena, params, UP(pm_error_recovery_node_create_unexpected(parser, param))); } break; } default: if (parser->previous.type == PM_TOKEN_COMMA) { - if (allows_trailing_comma && order >= PM_PARAMETERS_ORDER_NAMED) { - // If we get here, then we have a trailing comma in a - // block parameter list. - pm_node_t *param = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous); - - if (params->rest == NULL) { - pm_parameters_node_rest_set(params, param); - } else { - pm_parser_err_node(parser, (pm_node_t *) param, PM_ERR_PARAMETER_SPLAT_MULTI); - pm_parameters_node_posts_append(params, (pm_node_t *) param); - } - } else { - pm_parser_err_previous(parser, PM_ERR_PARAMETER_WILD_LOOSE_COMMA); - } + parse_parameters_handle_trailing_comma(parser, params, order, in_block, allows_trailing_comma); } parsing = false; @@ -14930,8 +14544,7 @@ parse_parameters( pm_do_loop_stack_pop(parser); // If we don't have any parameters, return `NULL` instead of an empty `ParametersNode`. - if (params->base.location.start == params->base.location.end) { - pm_node_destroy(parser, (pm_node_t *) params); + if (PM_NODE_START(params) == PM_NODE_END(params)) { return NULL; } @@ -14948,13 +14561,13 @@ token_newline_index(const pm_parser_t *parser) { // This is the common case. In this case we can look at the previously // recorded newline in the newline list and subtract from the current // offset. - return parser->newline_list.size - 1; + return parser->line_offsets.size - 1; } else { // This is unlikely. This is the case that we have already parsed the // start of a heredoc, so we cannot rely on looking at the previous // offset of the newline list, and instead must go through the whole // process of a binary search for the line number. - return (size_t) pm_newline_list_line(&parser->newline_list, parser->current.start, 0); + return (size_t) pm_line_offset_list_line(&parser->line_offsets, PM_TOKEN_START(parser, &parser->current), 0); } } @@ -14964,7 +14577,7 @@ token_newline_index(const pm_parser_t *parser) { */ static int64_t token_column(const pm_parser_t *parser, size_t newline_index, const pm_token_t *token, bool break_on_non_space) { - const uint8_t *cursor = parser->start + parser->newline_list.offsets[newline_index]; + const uint8_t *cursor = parser->start + parser->line_offsets.offsets[newline_index]; const uint8_t *end = token->start; // Skip over the BOM if it is present. @@ -15028,8 +14641,8 @@ parser_warn_indentation_mismatch(pm_parser_t *parser, size_t opening_newline_ind // Otherwise, add a warning. PM_PARSER_WARN_FORMAT( parser, - closing_token->start, - closing_token->end, + PM_TOKEN_START(parser, closing_token), + PM_TOKEN_LENGTH(closing_token), PM_WARN_INDENTATION_MISMATCH, (int) (closing_token->end - closing_token->start), (const char *) closing_token->start, @@ -15053,7 +14666,7 @@ typedef enum { * Parse any number of rescue clauses. This will form a linked list of if * nodes pointing to each other from the top. */ -static inline void +static PRISM_INLINE void parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_t *opening, pm_begin_node_t *parent_node, pm_rescues_type_t type, uint16_t depth) { pm_rescue_node_t *current = NULL; @@ -15069,9 +14682,9 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ // we're going to have an empty list of exceptions to rescue (which // implies StandardError). parser_lex(parser); - pm_rescue_node_operator_set(rescue, &parser->previous); + pm_rescue_node_operator_set(parser, rescue, &parser->previous); - pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); + pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); reference = parse_target(parser, reference, false, false); pm_rescue_node_reference_set(rescue, reference); @@ -15090,7 +14703,7 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ do { pm_node_t *expression = parse_starred_expression(parser, PM_BINDING_POWER_DEFINED, false, PM_ERR_RESCUE_EXPRESSION, (uint16_t) (depth + 1)); - pm_rescue_node_exceptions_append(rescue, expression); + pm_rescue_node_exceptions_append(parser->arena, rescue, expression); // If we hit a newline, then this is the end of the rescue expression. We // can continue on to parse the statements. @@ -15099,9 +14712,9 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ // If we hit a `=>` then we're going to parse the exception variable. Once // we've done that, we'll break out of the loop and parse the statements. if (accept1(parser, PM_TOKEN_EQUAL_GREATER)) { - pm_rescue_node_operator_set(rescue, &parser->previous); + pm_rescue_node_operator_set(parser, rescue, &parser->previous); - pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); + pm_node_t *reference = parse_expression(parser, PM_BINDING_POWER_INDEX, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_VARIABLE, (uint16_t) (depth + 1)); reference = parse_target(parser, reference, false, false); pm_rescue_node_reference_set(rescue, reference); @@ -15114,11 +14727,11 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { - rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous); + rescue->then_keyword_loc = TOK2LOC(parser, &parser->previous); } } else { expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_RESCUE_TERM); - rescue->then_keyword_loc = PM_OPTIONAL_LOCATION_TOKEN_VALUE(&parser->previous); + rescue->then_keyword_loc = TOK2LOC(parser, &parser->previous); } if (!match3(parser, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_END)) { @@ -15156,11 +14769,10 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ // since we won't know the end until we've found all subsequent // clauses. This sets the end location on all rescues once we know it. if (current != NULL) { - const uint8_t *end_to_set = current->base.location.end; pm_rescue_node_t *clause = parent_node->rescue_clause; while (clause != NULL) { - clause->base.location.end = end_to_set; + PM_NODE_LENGTH_SET_NODE(clause, current); clause = clause->subsequent; } } @@ -15203,7 +14815,7 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ // If we don't have a `current` rescue node, then this is a dangling // else, and it's an error. - if (current == NULL) pm_parser_err_node(parser, (pm_node_t *) else_clause, PM_ERR_BEGIN_LONELY_ELSE); + if (current == NULL) pm_parser_err_node(parser, UP(else_clause), PM_ERR_BEGIN_LONELY_ELSE); } if (match1(parser, PM_TOKEN_KEYWORD_ENSURE)) { @@ -15241,10 +14853,10 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ if (match1(parser, PM_TOKEN_KEYWORD_END)) { if (opening != NULL) parser_warn_indentation_mismatch(parser, opening_newline_index, opening, false, false); - pm_begin_node_end_keyword_set(parent_node, &parser->current); + pm_begin_node_end_keyword_set(parser, parent_node, &parser->current); } else { - pm_token_t end_keyword = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; - pm_begin_node_end_keyword_set(parent_node, &end_keyword); + pm_token_t end_keyword = (pm_token_t) { .type = PM_TOKEN_KEYWORD_END, .start = parser->previous.end, .end = parser->previous.end }; + pm_begin_node_end_keyword_set(parser, parent_node, &end_keyword); } } @@ -15254,11 +14866,11 @@ parse_rescues(pm_parser_t *parser, size_t opening_newline_index, const pm_token_ */ static pm_begin_node_t * parse_rescues_implicit_begin(pm_parser_t *parser, size_t opening_newline_index, const pm_token_t *opening, const uint8_t *start, pm_statements_node_t *statements, pm_rescues_type_t type, uint16_t depth) { - pm_token_t begin_keyword = not_provided(parser); - pm_begin_node_t *node = pm_begin_node_create(parser, &begin_keyword, statements); - + pm_begin_node_t *node = pm_begin_node_create(parser, NULL, statements); parse_rescues(parser, opening_newline_index, opening, node, type, (uint16_t) (depth + 1)); - node->base.location.start = start; + + node->base.location.start = U32(start - parser->start); + PM_NODE_LENGTH_SET_TOKEN(parser, node, &parser->current); return node; } @@ -15277,6 +14889,9 @@ parse_block_parameters( ) { pm_parameters_node_t *parameters = NULL; if (!match1(parser, PM_TOKEN_SEMICOLON)) { + if (!is_lambda_literal) { + context_push(parser, PM_CONTEXT_BLOCK_PARAMETERS); + } parameters = parse_parameters( parser, is_lambda_literal ? PM_BINDING_POWER_DEFINED : PM_BINDING_POWER_INDEX, @@ -15285,12 +14900,16 @@ parse_block_parameters( false, accepts_blocks_in_defaults, true, + is_lambda_literal ? PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES_LAMBDA : PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES_BLOCK, (uint16_t) (depth + 1) ); + if (!is_lambda_literal) { + context_pop(parser); + } } pm_block_parameters_node_t *block_parameters = pm_block_parameters_node_create(parser, parameters, opening); - if ((opening->type != PM_TOKEN_NOT_PROVIDED)) { + if (opening != NULL) { accept1(parser, PM_TOKEN_NEWLINE); if (accept1(parser, PM_TOKEN_SEMICOLON)) { @@ -15321,9 +14940,9 @@ parse_block_parameters( pm_parser_local_add_token(parser, &parser->previous, 1); pm_block_local_variable_node_t *local = pm_block_local_variable_node_create(parser, &parser->previous); - if (repeated) pm_node_flag_set_repeated_parameter((pm_node_t *) local); + if (repeated) pm_node_flag_set_repeated_parameter(UP(local)); - pm_block_parameters_node_append_local(block_parameters, local); + pm_block_parameters_node_append_local(parser->arena, block_parameters, local); } while (accept1(parser, PM_TOKEN_COMMA)); } } @@ -15403,8 +15022,8 @@ parse_blocklike_parameters(pm_parser_t *parser, pm_node_t *parameters, const pm_ pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_OUTER_BLOCK); } else if (parser->current_scope->parameters & PM_SCOPE_PARAMETERS_NUMBERED_INNER) { pm_parser_err_node(parser, node, PM_ERR_NUMBERED_PARAMETER_INNER_BLOCK); - } else if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { - numbered_parameter = MAX(numbered_parameter, (uint8_t) (node->location.start[1] - '0')); + } else if (pm_token_is_numbered_parameter(parser, PM_NODE_START(node), PM_NODE_LENGTH(node))) { + numbered_parameter = MAX(numbered_parameter, (uint8_t) (parser->start[node->location.start + 1] - '0')); } else { assert(false && "unreachable"); } @@ -15423,13 +15042,11 @@ parse_blocklike_parameters(pm_parser_t *parser, pm_node_t *parameters, const pm_ for (pm_scope_t *scope = parser->current_scope->previous; scope != NULL && !scope->closed; scope = scope->previous) { scope->parameters |= PM_SCOPE_PARAMETERS_NUMBERED_INNER; } - - const pm_location_t location = { .start = opening->start, .end = closing->end }; - return (pm_node_t *) pm_numbered_parameters_node_create(parser, &location, numbered_parameter); + return UP(pm_numbered_parameters_node_create(parser, opening, closing, numbered_parameter)); } if (it_parameter) { - return (pm_node_t *) pm_it_parameters_node_create(parser, opening, closing); + return UP(pm_it_parameters_node_create(parser, opening, closing)); } return NULL; @@ -15461,7 +15078,7 @@ parse_block(pm_parser_t *parser, uint16_t depth) { expect1(parser, PM_TOKEN_PIPE, PM_ERR_BLOCK_PARAM_PIPE_TERM); } - pm_block_parameters_node_closing_set(block_parameters, &parser->previous); + pm_block_parameters_node_closing_set(parser, block_parameters, &parser->previous); } accept1(parser, PM_TOKEN_NEWLINE); @@ -15469,30 +15086,30 @@ parse_block(pm_parser_t *parser, uint16_t depth) { if (opening.type == PM_TOKEN_BRACE_LEFT) { if (!match1(parser, PM_TOKEN_BRACE_RIGHT)) { - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_BLOCK_BRACES, (uint16_t) (depth + 1)); + statements = UP(parse_statements(parser, PM_CONTEXT_BLOCK_BRACES, (uint16_t) (depth + 1))); } - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BLOCK_TERM_BRACE); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BLOCK_TERM_BRACE, &opening); } else { if (!match1(parser, PM_TOKEN_KEYWORD_END)) { if (!match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_ENSURE)) { pm_accepts_block_stack_push(parser, true); - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_BLOCK_KEYWORDS, (uint16_t) (depth + 1)); + statements = UP(parse_statements(parser, PM_CONTEXT_BLOCK_KEYWORDS, (uint16_t) (depth + 1))); pm_accepts_block_stack_pop(parser); } if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = (pm_node_t *) parse_rescues_implicit_begin(parser, 0, NULL, opening.start, (pm_statements_node_t *) statements, PM_RESCUES_BLOCK, (uint16_t) (depth + 1)); + statements = UP(parse_rescues_implicit_begin(parser, 0, NULL, opening.start, (pm_statements_node_t *) statements, PM_RESCUES_BLOCK, (uint16_t) (depth + 1))); } } - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_BLOCK_TERM_END); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_BLOCK_TERM_END, &opening); } pm_constant_id_list_t locals; pm_locals_order(parser, &parser->current_scope->locals, &locals, pm_parser_scope_toplevel_p(parser)); - pm_node_t *parameters = parse_blocklike_parameters(parser, (pm_node_t *) block_parameters, &opening, &parser->previous); + pm_node_t *parameters = parse_blocklike_parameters(parser, UP(block_parameters), &opening, &parser->previous); pm_parser_scope_pop(parser); pm_accepts_block_stack_pop(parser); @@ -15506,42 +15123,54 @@ parse_block(pm_parser_t *parser, uint16_t depth) { * arguments, or blocks). */ static bool -parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_block, bool accepts_command_call, uint16_t depth) { +parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accepts_block, uint8_t flags, uint16_t depth) { + /* Fast path: if the current token can't begin an expression and isn't + * a parenthesis, block opener, or splat/block-pass operator, there are + * no arguments to parse. */ + if ( + !token_begins_expression_p(parser->current.type) && + !match6(parser, PM_TOKEN_PARENTHESIS_LEFT, PM_TOKEN_KEYWORD_DO, PM_TOKEN_KEYWORD_DO_BLOCK, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR, PM_TOKEN_UAMPERSAND) + ) { + return false; + } + bool found = false; + bool parsed_command_args = false; if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { found |= true; - arguments->opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments->opening_loc = TOK2LOC(parser, &parser->previous); if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - arguments->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments->closing_loc = TOK2LOC(parser, &parser->previous); } else { pm_accepts_block_stack_push(parser, true); - parse_arguments(parser, arguments, accepts_block, PM_TOKEN_PARENTHESIS_RIGHT, (uint16_t) (depth + 1)); + parse_arguments(parser, arguments, accepts_block, PM_TOKEN_PARENTHESIS_RIGHT, (uint8_t) (flags & ~PM_PARSE_ACCEPTS_DO_BLOCK), (uint16_t) (depth + 1)); if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARGUMENT_TERM_PAREN, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_ARGUMENT_TERM_PAREN, pm_token_str(parser->current.type)); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } pm_accepts_block_stack_pop(parser); - arguments->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments->closing_loc = TOK2LOC(parser, &parser->previous); } - } else if (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR, PM_TOKEN_UAMPERSAND)) && !match1(parser, PM_TOKEN_BRACE_LEFT)) { + } else if ((flags & PM_PARSE_ACCEPTS_COMMAND_CALL) && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR, PM_TOKEN_UAMPERSAND)) && !match1(parser, PM_TOKEN_BRACE_LEFT)) { found |= true; + parsed_command_args = true; pm_accepts_block_stack_push(parser, false); // If we get here, then the subsequent token cannot be used as an infix // operator. In this case we assume the subsequent token is part of an // argument to this method call. - parse_arguments(parser, arguments, accepts_block, PM_TOKEN_EOF, (uint16_t) (depth + 1)); + parse_arguments(parser, arguments, accepts_block, PM_TOKEN_EOF, flags, (uint16_t) (depth + 1)); // If we have done with the arguments and still not consumed the comma, // then we have a trailing comma where we need to check whether it is // allowed or not. if (parser->previous.type == PM_TOKEN_COMMA && !match1(parser, PM_TOKEN_SEMICOLON)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_EXPECT_ARGUMENT, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, PM_ERR_EXPECT_ARGUMENT, pm_token_str(parser->current.type)); } pm_accepts_block_stack_pop(parser); @@ -15560,21 +15189,24 @@ parse_arguments_list(pm_parser_t *parser, pm_arguments_t *arguments, bool accept } else if (pm_accepts_block_stack_p(parser) && accept1(parser, PM_TOKEN_KEYWORD_DO)) { found |= true; block = parse_block(parser, (uint16_t) (depth + 1)); + } else if (parsed_command_args && pm_accepts_block_stack_p(parser) && (flags & PM_PARSE_ACCEPTS_DO_BLOCK) && accept1(parser, PM_TOKEN_KEYWORD_DO_BLOCK)) { + found |= true; + block = parse_block(parser, (uint16_t) (depth + 1)); } if (block != NULL) { if (arguments->block == NULL && !arguments->has_forwarding) { - arguments->block = (pm_node_t *) block; + arguments->block = UP(block); } else { - pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_BLOCK_MULTI); + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_BLOCK_MULTI); if (arguments->block != NULL) { if (arguments->arguments == NULL) { arguments->arguments = pm_arguments_node_create(parser); } - pm_arguments_node_arguments_append(arguments->arguments, arguments->block); + pm_arguments_node_arguments_append(parser->arena, arguments->arguments, arguments->block); } - arguments->block = (pm_node_t *) block; + arguments->block = UP(block); } } } @@ -15642,6 +15274,7 @@ parse_return(pm_parser_t *parser, pm_node_t *node) { case PM_CONTEXT_BLOCK_ENSURE: case PM_CONTEXT_BLOCK_KEYWORDS: case PM_CONTEXT_BLOCK_RESCUE: + case PM_CONTEXT_BLOCK_PARAMETERS: case PM_CONTEXT_DEF_ELSE: case PM_CONTEXT_DEF_ENSURE: case PM_CONTEXT_DEF_PARAMS: @@ -15661,7 +15294,7 @@ parse_return(pm_parser_t *parser, pm_node_t *node) { break; } } - if (in_sclass) { + if (in_sclass && parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) { pm_parser_err_node(parser, node, PM_ERR_RETURN_INVALID); } } @@ -15678,6 +15311,7 @@ parse_block_exit(pm_parser_t *parser, pm_node_t *node) { case PM_CONTEXT_BLOCK_KEYWORDS: case PM_CONTEXT_BLOCK_ELSE: case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_BLOCK_PARAMETERS: case PM_CONTEXT_BLOCK_RESCUE: case PM_CONTEXT_DEFINED: case PM_CONTEXT_FOR: @@ -15687,12 +15321,19 @@ parse_block_exit(pm_parser_t *parser, pm_node_t *node) { case PM_CONTEXT_LAMBDA_ENSURE: case PM_CONTEXT_LAMBDA_RESCUE: case PM_CONTEXT_LOOP_PREDICATE: - case PM_CONTEXT_POSTEXE: case PM_CONTEXT_UNTIL: case PM_CONTEXT_WHILE: // These are the good cases. We're allowed to have a block exit // in these contexts. return; + case PM_CONTEXT_POSTEXE: + // https://bugs.ruby-lang.org/issues/20409 + if (context_node->context == PM_CONTEXT_POSTEXE) { + if (parser->version < PM_OPTIONS_VERSION_CRUBY_4_1) { + return; + } + } + PRISM_FALLTHROUGH case PM_CONTEXT_DEF: case PM_CONTEXT_DEF_PARAMS: case PM_CONTEXT_DEF_ELSE: @@ -15714,7 +15355,7 @@ parse_block_exit(pm_parser_t *parser, pm_node_t *node) { // block exit to the list of exits for the expression, and // the node parsing will handle validating it instead. assert(parser->current_block_exits != NULL); - pm_node_list_append(parser->current_block_exits, node); + pm_node_list_append(parser->arena, parser->current_block_exits, node); return; case PM_CONTEXT_BEGIN_ELSE: case PM_CONTEXT_BEGIN_ENSURE: @@ -15805,7 +15446,7 @@ pop_block_exits(pm_parser_t *parser, pm_node_list_t *previous_block_exits) { // However, they could still become valid in a higher level context if // there is another list above this one. In this case we'll push all of // the block exits up to the previous list. - pm_node_list_concat(previous_block_exits, parser->current_block_exits); + pm_node_list_concat(parser->arena, previous_block_exits, parser->current_block_exits); parser->current_block_exits = previous_block_exits; } else { // If we did not match a trailing while/until and this was the last @@ -15815,11 +15456,11 @@ pop_block_exits(pm_parser_t *parser, pm_node_list_t *previous_block_exits) { } } -static inline pm_node_t * +static PRISM_INLINE pm_node_t * parse_predicate(pm_parser_t *parser, pm_binding_power_t binding_power, pm_context_t context, pm_token_t *then_keyword, uint16_t depth) { context_push(parser, PM_CONTEXT_PREDICATE); pm_diagnostic_id_t error_id = context == PM_CONTEXT_IF ? PM_ERR_CONDITIONAL_IF_PREDICATE : PM_ERR_CONDITIONAL_UNLESS_PREDICATE; - pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, error_id, (uint16_t) (depth + 1)); + pm_node_t *predicate = parse_value_expression(parser, binding_power, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, error_id, (uint16_t) (depth + 1)); // Predicates are closed by a term, a "then", or a term and then a "then". bool predicate_closed = accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); @@ -15837,13 +15478,13 @@ parse_predicate(pm_parser_t *parser, pm_binding_power_t binding_power, pm_contex return predicate; } -static inline pm_node_t * +static PRISM_INLINE pm_node_t * parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newline_index, bool if_after_else, uint16_t depth) { pm_node_list_t current_block_exits = { 0 }; pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); pm_token_t keyword = parser->previous; - pm_token_t then_keyword = not_provided(parser); + pm_token_t then_keyword = { 0 }; pm_node_t *predicate = parse_predicate(parser, PM_BINDING_POWER_MODIFIER, context, &then_keyword, (uint16_t) (depth + 1)); pm_statements_node_t *statements = NULL; @@ -15855,15 +15496,14 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); } - pm_token_t end_keyword = not_provided(parser); pm_node_t *parent = NULL; switch (context) { case PM_CONTEXT_IF: - parent = (pm_node_t *) pm_if_node_create(parser, &keyword, predicate, &then_keyword, statements, NULL, &end_keyword); + parent = UP(pm_if_node_create(parser, &keyword, predicate, NTOK2PTR(then_keyword), statements, NULL, NULL)); break; case PM_CONTEXT_UNLESS: - parent = (pm_node_t *) pm_unless_node_create(parser, &keyword, predicate, &then_keyword, statements); + parent = UP(pm_unless_node_create(parser, &keyword, predicate, NTOK2PTR(then_keyword), statements)); break; default: assert(false && "unreachable"); @@ -15877,7 +15517,7 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl if (context == PM_CONTEXT_IF) { while (match1(parser, PM_TOKEN_KEYWORD_ELSIF)) { if (parser_end_of_line_p(parser)) { - PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_KEYWORD_EOL); + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, &parser->current, PM_WARN_KEYWORD_EOL); } parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); @@ -15891,7 +15531,7 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl pm_accepts_block_stack_pop(parser); accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - pm_node_t *elsif = (pm_node_t *) pm_if_node_create(parser, &elsif_keyword, predicate, &then_keyword, statements, NULL, &end_keyword); + pm_node_t *elsif = UP(pm_if_node_create(parser, &elsif_keyword, predicate, NTOK2PTR(then_keyword), statements, NULL, NULL)); ((pm_if_node_t *) current)->subsequent = elsif; current = elsif; } @@ -15910,13 +15550,13 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); parser_warn_indentation_mismatch(parser, opening_newline_index, &else_keyword, false, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM_ELSE); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM_ELSE, &keyword); pm_else_node_t *else_node = pm_else_node_create(parser, &else_keyword, else_statements, &parser->previous); switch (context) { case PM_CONTEXT_IF: - ((pm_if_node_t *) current)->subsequent = (pm_node_t *) else_node; + ((pm_if_node_t *) current)->subsequent = UP(else_node); break; case PM_CONTEXT_UNLESS: ((pm_unless_node_t *) parent)->else_clause = else_node; @@ -15927,7 +15567,7 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl } } else { parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, if_after_else, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CONDITIONAL_TERM, &keyword); } // Set the appropriate end location for all of the nodes in the subtree. @@ -15939,12 +15579,12 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl while (recursing) { switch (PM_NODE_TYPE(current)) { case PM_IF_NODE: - pm_if_node_end_keyword_loc_set((pm_if_node_t *) current, &parser->previous); + pm_if_node_end_keyword_loc_set(parser, (pm_if_node_t *) current, &parser->previous); current = ((pm_if_node_t *) current)->subsequent; recursing = current != NULL; break; case PM_ELSE_NODE: - pm_else_node_end_keyword_loc_set((pm_else_node_t *) current, &parser->previous); + pm_else_node_end_keyword_loc_set(parser, (pm_else_node_t *) current, &parser->previous); recursing = false; break; default: { @@ -15956,7 +15596,7 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl break; } case PM_CONTEXT_UNLESS: - pm_unless_node_end_keyword_loc_set((pm_unless_node_t *) parent, &parser->previous); + pm_unless_node_end_keyword_loc_set(parser, (pm_unless_node_t *) parent, &parser->previous); break; default: assert(false && "unreachable"); @@ -15964,8 +15604,6 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl } pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - return parent; } @@ -15976,7 +15614,7 @@ parse_conditional(pm_parser_t *parser, pm_context_t context, size_t opening_newl #define PM_CASE_KEYWORD PM_TOKEN_KEYWORD___ENCODING__: case PM_TOKEN_KEYWORD___FILE__: case PM_TOKEN_KEYWORD___LINE__: \ case PM_TOKEN_KEYWORD_ALIAS: case PM_TOKEN_KEYWORD_AND: case PM_TOKEN_KEYWORD_BEGIN: case PM_TOKEN_KEYWORD_BEGIN_UPCASE: \ case PM_TOKEN_KEYWORD_BREAK: case PM_TOKEN_KEYWORD_CASE: case PM_TOKEN_KEYWORD_CLASS: case PM_TOKEN_KEYWORD_DEF: \ - case PM_TOKEN_KEYWORD_DEFINED: case PM_TOKEN_KEYWORD_DO: case PM_TOKEN_KEYWORD_DO_LOOP: case PM_TOKEN_KEYWORD_ELSE: \ + case PM_TOKEN_KEYWORD_DEFINED: case PM_TOKEN_KEYWORD_DO: case PM_TOKEN_KEYWORD_DO_BLOCK: case PM_TOKEN_KEYWORD_DO_LOOP: case PM_TOKEN_KEYWORD_ELSE: \ case PM_TOKEN_KEYWORD_ELSIF: case PM_TOKEN_KEYWORD_END: case PM_TOKEN_KEYWORD_END_UPCASE: case PM_TOKEN_KEYWORD_ENSURE: \ case PM_TOKEN_KEYWORD_FALSE: case PM_TOKEN_KEYWORD_FOR: case PM_TOKEN_KEYWORD_IF: case PM_TOKEN_KEYWORD_IN: \ case PM_TOKEN_KEYWORD_MODULE: case PM_TOKEN_KEYWORD_NEXT: case PM_TOKEN_KEYWORD_NIL: case PM_TOKEN_KEYWORD_NOT: \ @@ -16039,7 +15677,7 @@ PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int * If the encoding was explicitly set through the lexing process, then we need * to potentially mark the string's flags to indicate how to encode it. */ -static inline pm_node_flags_t +static PRISM_INLINE pm_node_flags_t parse_unescaped_encoding(const pm_parser_t *parser) { if (parser->explicit_encoding != NULL) { if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { @@ -16071,10 +15709,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { // "aaa #{bbb} #@ccc ddd" // ^^^^ ^ ^^^^ case PM_TOKEN_STRING_CONTENT: { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); + pm_node_t *node = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); pm_node_flag_set(node, parse_unescaped_encoding(parser)); parser_lex(parser); @@ -16101,7 +15736,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { pm_token_t opening = parser->previous; pm_statements_node_t *statements = NULL; - if (!match1(parser, PM_TOKEN_EMBEXPR_END)) { + if (!match3(parser, PM_TOKEN_EMBEXPR_END, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { pm_accepts_block_stack_push(parser, true); statements = parse_statements(parser, PM_CONTEXT_EMBEXPR, (uint16_t) (depth + 1)); pm_accepts_block_stack_pop(parser); @@ -16109,9 +15744,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { parser->brace_nesting = brace_nesting; lex_state_set(parser, state); - expect1(parser, PM_TOKEN_EMBEXPR_END, PM_ERR_EMBEXPR_END); - pm_token_t closing = parser->previous; // If this set of embedded statements only contains a single // statement, then Ruby does not consider it as a possible statement @@ -16120,7 +15753,7 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { pm_node_flag_unset(statements->body.nodes[0], PM_NODE_FLAG_NEWLINE); } - return (pm_node_t *) pm_embedded_statements_node_create(parser, &opening, statements, &closing); + return UP(pm_embedded_statements_node_create(parser, &opening, statements, &parser->previous)); } // Here the lexer has returned the beginning of an embedded variable. @@ -16145,42 +15778,42 @@ parse_string_part(pm_parser_t *parser, uint16_t depth) { // create a global variable read node. case PM_TOKEN_BACK_REFERENCE: parser_lex(parser); - variable = (pm_node_t *) pm_back_reference_read_node_create(parser, &parser->previous); + variable = UP(pm_back_reference_read_node_create(parser, &parser->previous)); break; // In this case an nth reference is being interpolated. We'll // create a global variable read node. case PM_TOKEN_NUMBERED_REFERENCE: parser_lex(parser); - variable = (pm_node_t *) pm_numbered_reference_read_node_create(parser, &parser->previous); + variable = UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); break; // In this case a global variable is being interpolated. We'll // create a global variable read node. case PM_TOKEN_GLOBAL_VARIABLE: parser_lex(parser); - variable = (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous); + variable = UP(pm_global_variable_read_node_create(parser, &parser->previous)); break; // In this case an instance variable is being interpolated. // We'll create an instance variable read node. case PM_TOKEN_INSTANCE_VARIABLE: parser_lex(parser); - variable = (pm_node_t *) pm_instance_variable_read_node_create(parser, &parser->previous); + variable = UP(pm_instance_variable_read_node_create(parser, &parser->previous)); break; // In this case a class variable is being interpolated. We'll // create a class variable read node. case PM_TOKEN_CLASS_VARIABLE: parser_lex(parser); - variable = (pm_node_t *) pm_class_variable_read_node_create(parser, &parser->previous); + variable = UP(pm_class_variable_read_node_create(parser, &parser->previous)); break; // We can hit here if we got an invalid token. In that case // we'll not attempt to lex this token and instead just return a // missing node. default: expect1(parser, PM_TOKEN_IDENTIFIER, PM_ERR_EMBVAR_INVALID); - variable = (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + variable = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); break; } - return (pm_node_t *) pm_embedded_variable_node_create(parser, &operator, variable); + return UP(pm_embedded_variable_node_create(parser, &operator, variable)); } default: parser_lex(parser); @@ -16208,18 +15841,16 @@ parse_operator_symbol_name(const pm_token_t *name) { static pm_node_t * parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_state_t next_state) { - pm_token_t closing = not_provided(parser); - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, opening, &parser->current, &closing); - + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, opening, &parser->current, NULL); const uint8_t *end = parse_operator_symbol_name(&parser->current); if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state); parser_lex(parser); pm_string_shared_init(&symbol->unescaped, parser->previous.start, end); - pm_node_flag_set((pm_node_t *) symbol, PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING); + pm_node_flag_set(UP(symbol), PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING); - return (pm_node_t *) symbol; + return UP(symbol); } /** @@ -16253,13 +15884,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s break; } - pm_token_t closing = not_provided(parser); - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); - + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, NULL); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); - pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); + pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); - return (pm_node_t *) symbol; + return UP(symbol); } if (lex_mode->as.string.interpolation) { @@ -16267,10 +15896,13 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s if (match1(parser, PM_TOKEN_STRING_END)) { if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state); parser_lex(parser); + pm_token_t content = { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->previous.start, + .end = parser->previous.start + }; - pm_token_t content = not_provided(parser); - pm_token_t closing = parser->previous; - return (pm_node_t *) pm_symbol_node_create(parser, &opening, &content, &closing); + return UP(pm_symbol_node_create(parser, &opening, &content, &parser->previous)); } // Now we can parse the first part of the symbol. @@ -16282,15 +15914,15 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s if (next_state != PM_LEX_STATE_NONE) lex_state_set(parser, next_state); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_INTERPOLATED); - return (pm_node_t *) pm_string_node_to_symbol_node(parser, (pm_string_node_t *) part, &opening, &parser->previous); + return UP(pm_string_node_to_symbol_node(parser, (pm_string_node_t *) part, &opening, &parser->previous)); } pm_interpolated_symbol_node_t *symbol = pm_interpolated_symbol_node_create(parser, &opening, NULL, &opening); - if (part) pm_interpolated_symbol_node_append(symbol, part); + if (part) pm_interpolated_symbol_node_append(parser->arena, symbol, part); while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_interpolated_symbol_node_append(symbol, part); + pm_interpolated_symbol_node_append(parser->arena, symbol, part); } } @@ -16301,8 +15933,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_INTERPOLATED); } - pm_interpolated_symbol_node_closing_loc_set(symbol, &parser->previous); - return (pm_node_t *) symbol; + pm_interpolated_symbol_node_closing_loc_set(parser, symbol, &parser->previous); + return UP(symbol); } pm_token_t content; @@ -16324,13 +15956,11 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s // interpolated string node, so that's what we'll do here. if (match1(parser, PM_TOKEN_STRING_CONTENT)) { pm_interpolated_symbol_node_t *symbol = pm_interpolated_symbol_node_create(parser, &opening, NULL, &opening); - pm_token_t bounds = not_provided(parser); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &unescaped)); + pm_interpolated_symbol_node_append(parser->arena, symbol, part); - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &unescaped); - pm_interpolated_symbol_node_append(symbol, part); - - part = (pm_node_t *) pm_string_node_create_unescaped(parser, &bounds, &parser->current, &bounds, &parser->current_string); - pm_interpolated_symbol_node_append(symbol, part); + part = UP(pm_string_node_create_unescaped(parser, NULL, &parser->current, NULL, &parser->current_string)); + pm_interpolated_symbol_node_append(parser->arena, symbol, part); if (next_state != PM_LEX_STATE_NONE) { lex_state_set(parser, next_state); @@ -16339,8 +15969,8 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s parser_lex(parser); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC); - pm_interpolated_symbol_node_closing_loc_set(symbol, &parser->previous); - return (pm_node_t *) symbol; + pm_interpolated_symbol_node_closing_loc_set(parser, symbol, &parser->previous); + return UP(symbol); } } else { content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->previous.end, .end = parser->previous.end }; @@ -16357,34 +15987,29 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC); } - return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, false)); + return UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, false))); } /** * Parse an argument to undef which can either be a bare word, a symbol, a * constant, or an interpolated symbol. */ -static inline pm_node_t * +static PRISM_INLINE pm_node_t * parse_undef_argument(pm_parser_t *parser, uint16_t depth) { switch (parser->current.type) { - case PM_CASE_OPERATOR: { - const pm_token_t opening = not_provided(parser); - return parse_operator_symbol(parser, &opening, PM_LEX_STATE_NONE); - } + case PM_CASE_OPERATOR: + return parse_operator_symbol(parser, NULL, PM_LEX_STATE_NONE); case PM_CASE_KEYWORD: case PM_TOKEN_CONSTANT: case PM_TOKEN_IDENTIFIER: case PM_TOKEN_METHOD_NAME: { parser_lex(parser); - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); - + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, NULL, &parser->previous, NULL); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); - pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); + pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); - return (pm_node_t *) symbol; + return UP(symbol); } case PM_TOKEN_SYMBOL_BEGIN: { pm_lex_mode_t lex_mode = *parser->lex_modes.current; @@ -16394,7 +16019,7 @@ parse_undef_argument(pm_parser_t *parser, uint16_t depth) { } default: pm_parser_err_current(parser, PM_ERR_UNDEF_ARGUMENT); - return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); } } @@ -16404,13 +16029,11 @@ parse_undef_argument(pm_parser_t *parser, uint16_t depth) { * we need to set the lex state to PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM * between the first and second arguments. */ -static inline pm_node_t * +static PRISM_INLINE pm_node_t * parse_alias_argument(pm_parser_t *parser, bool first, uint16_t depth) { switch (parser->current.type) { - case PM_CASE_OPERATOR: { - const pm_token_t opening = not_provided(parser); - return parse_operator_symbol(parser, &opening, first ? PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE); - } + case PM_CASE_OPERATOR: + return parse_operator_symbol(parser, NULL, first ? PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM : PM_LEX_STATE_NONE); case PM_CASE_KEYWORD: case PM_TOKEN_CONSTANT: case PM_TOKEN_IDENTIFIER: @@ -16418,14 +16041,11 @@ parse_alias_argument(pm_parser_t *parser, bool first, uint16_t depth) { if (first) lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM); parser_lex(parser); - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); - + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, NULL, &parser->previous, NULL); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); - pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); + pm_node_flag_set(UP(symbol), parse_symbol_encoding(parser, &parser->previous, &symbol->unescaped, false)); - return (pm_node_t *) symbol; + return UP(symbol); } case PM_TOKEN_SYMBOL_BEGIN: { pm_lex_mode_t lex_mode = *parser->lex_modes.current; @@ -16435,16 +16055,16 @@ parse_alias_argument(pm_parser_t *parser, bool first, uint16_t depth) { } case PM_TOKEN_BACK_REFERENCE: parser_lex(parser); - return (pm_node_t *) pm_back_reference_read_node_create(parser, &parser->previous); + return UP(pm_back_reference_read_node_create(parser, &parser->previous)); case PM_TOKEN_NUMBERED_REFERENCE: parser_lex(parser); - return (pm_node_t *) pm_numbered_reference_read_node_create(parser, &parser->previous); + return UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); case PM_TOKEN_GLOBAL_VARIABLE: parser_lex(parser); - return (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous); + return UP(pm_global_variable_read_node_create(parser, &parser->previous)); default: pm_parser_err_current(parser, PM_ERR_ALIAS_ARGUMENT); - return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); } } @@ -16456,10 +16076,10 @@ static pm_node_t * parse_variable(pm_parser_t *parser) { pm_constant_id_t name_id = pm_parser_constant_id_token(parser, &parser->previous); int depth; - bool is_numbered_param = pm_token_is_numbered_parameter(parser->previous.start, parser->previous.end); + bool is_numbered_param = pm_token_is_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous)); if (!is_numbered_param && ((depth = pm_parser_local_depth_constant_id(parser, name_id)) != -1)) { - return (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, (uint32_t) depth, false); + return UP(pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, (uint32_t) depth, false)); } pm_scope_t *current_scope = parser->current_scope; @@ -16478,13 +16098,13 @@ parse_variable(pm_parser_t *parser) { parser->current_scope->parameters |= PM_SCOPE_PARAMETERS_NUMBERED_FOUND; } - pm_node_t *node = (pm_node_t *) pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0, false); - pm_node_list_append(¤t_scope->implicit_parameters, node); + pm_node_t *node = UP(pm_local_variable_read_node_create_constant_id(parser, &parser->previous, name_id, 0, false)); + pm_node_list_append(parser->arena, ¤t_scope->implicit_parameters, node); return node; - } else if ((parser->version != PM_OPTIONS_VERSION_CRUBY_3_3) && pm_token_is_it(parser->previous.start, parser->previous.end)) { - pm_node_t *node = (pm_node_t *) pm_it_local_variable_read_node_create(parser, &parser->previous); - pm_node_list_append(¤t_scope->implicit_parameters, node); + } else if ((parser->version >= PM_OPTIONS_VERSION_CRUBY_3_4) && pm_token_is_it(parser->previous.start, parser->previous.end)) { + pm_node_t *node = UP(pm_it_local_variable_read_node_create(parser, &parser->previous)); + pm_node_list_append(parser->arena, ¤t_scope->implicit_parameters, node); return node; } @@ -16507,9 +16127,9 @@ parse_variable_call(pm_parser_t *parser) { } pm_call_node_t *node = pm_call_node_variable_call_create(parser, &parser->previous); - pm_node_flag_set((pm_node_t *)node, flags); + pm_node_flag_set(UP(node), flags); - return (pm_node_t *) node; + return UP(node); } /** @@ -16517,7 +16137,7 @@ parse_variable_call(pm_parser_t *parser) { * parser. If it does not match a valid method definition name, then a missing * token is returned. */ -static inline pm_token_t +static PRISM_INLINE pm_token_t parse_method_definition_name(pm_parser_t *parser) { switch (parser->current.type) { case PM_CASE_KEYWORD: @@ -16526,7 +16146,7 @@ parse_method_definition_name(pm_parser_t *parser) { parser_lex(parser); return parser->previous; case PM_TOKEN_IDENTIFIER: - pm_refute_numbered_parameter(parser, parser->current.start, parser->current.end); + pm_refute_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current)); parser_lex(parser); return parser->previous; case PM_CASE_OPERATOR: @@ -16534,22 +16154,31 @@ parse_method_definition_name(pm_parser_t *parser) { parser_lex(parser); return parser->previous; default: - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_DEF_NAME, pm_token_type_human(parser->current.type)); - return (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->current.start, .end = parser->current.end }; + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_DEF_NAME, pm_token_str(parser->current.type)); + return (pm_token_t) { .type = 0, .start = parser->current.start, .end = parser->current.end }; } } static void -parse_heredoc_dedent_string(pm_string_t *string, size_t common_whitespace) { - // Get a reference to the string struct that is being held by the string - // node. This is the value we're going to actually manipulate. - pm_string_ensure_owned(string); +parse_heredoc_dedent_string(pm_arena_t *arena, pm_string_t *string, size_t common_whitespace) { + // Make a writable copy in the arena if the string isn't already writable. + // We keep a mutable pointer to the arena memory so we can memmove into it + // below without casting away const from the string's source field. + uint8_t *writable; + + if (string->type != PM_STRING_OWNED) { + size_t length = pm_string_length(string); + writable = (uint8_t *) pm_arena_memdup(arena, pm_string_source(string), length, PRISM_ALIGNOF(uint8_t)); + pm_string_constant_init(string, (const char *) writable, length); + } else { + writable = (uint8_t *) string->source; + } // Now get the bounds of the existing string. We'll use this as a // destination to move bytes into. We'll also use it for bounds checking // since we don't require that these strings be null terminated. size_t dest_length = pm_string_length(string); - const uint8_t *source_cursor = (uint8_t *) string->source; + const uint8_t *source_cursor = writable; const uint8_t *source_end = source_cursor + dest_length; // We're going to move bytes backward in the string when we get leading @@ -16573,11 +16202,24 @@ parse_heredoc_dedent_string(pm_string_t *string, size_t common_whitespace) { dest_length--; } - memmove((uint8_t *) string->source, source_cursor, (size_t) (source_end - source_cursor)); + memmove(writable, source_cursor, (size_t) (source_end - source_cursor)); string->length = dest_length; } /** + * If we end up trimming all of the whitespace from a node and it isn't + * part of a line continuation, then we'll drop it from the list entirely. + */ +static PRISM_INLINE bool +heredoc_dedent_discard_string_node(pm_parser_t *parser, pm_string_node_t *string_node) { + if (string_node->unescaped.length == 0) { + const uint8_t *cursor = parser->start + PM_LOCATION_START(&string_node->content_loc); + return pm_memchr(cursor, '\\', string_node->content_loc.length, parser->encoding_changed, parser->encoding) == NULL; + } + return false; +} + +/** * Take a heredoc node that is indented by a ~ and trim the leading whitespace. */ static void @@ -16587,8 +16229,7 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_list_t *nodes, size_t common_w bool dedent_next = true; // Iterate over all nodes, and trim whitespace accordingly. We're going to - // keep around two indices: a read and a write. If we end up trimming all of - // the whitespace from a node, then we'll drop it from the list entirely. + // keep around two indices: a read and a write. size_t write_index = 0; pm_node_t *node; @@ -16604,11 +16245,10 @@ parse_heredoc_dedent(pm_parser_t *parser, pm_node_list_t *nodes, size_t common_w pm_string_node_t *string_node = ((pm_string_node_t *) node); if (dedent_next) { - parse_heredoc_dedent_string(&string_node->unescaped, common_whitespace); + parse_heredoc_dedent_string(parser->arena, &string_node->unescaped, common_whitespace); } - if (string_node->unescaped.length == 0) { - pm_node_destroy(parser, node); + if (heredoc_dedent_discard_string_node(parser, string_node)) { } else { nodes->nodes[write_index++] = node; } @@ -16631,7 +16271,7 @@ parse_strings_empty_content(const uint8_t *location) { /** * Parse a set of strings that could be concatenated together. */ -static inline pm_node_t * +static PRISM_INLINE pm_node_t * parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint16_t depth) { assert(parser->current.type == PM_TOKEN_STRING_BEGIN); bool concating = false; @@ -16658,16 +16298,14 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 pm_string_node_t *string = pm_string_node_create(parser, &opening, &content, &parser->previous); pm_string_shared_init(&string->unescaped, content.start, content.end); - node = (pm_node_t *) string; + node = UP(string); } else if (accept1(parser, PM_TOKEN_LABEL_END)) { // If we get here, then we have an end of a label immediately // after a start. In that case we'll create an empty symbol // node. - pm_token_t content = parse_strings_empty_content(parser->previous.start); - pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &content, &parser->previous); - - pm_string_shared_init(&symbol->unescaped, content.start, content.end); - node = (pm_node_t *) symbol; + pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, NULL, &parser->previous); + pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.start); + node = UP(symbol); if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); } else if (!lex_interpolation) { @@ -16678,7 +16316,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 if (match1(parser, PM_TOKEN_EOF)) { unescaped = PM_STRING_EMPTY; - content = not_provided(parser); + content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = parser->start, .end = parser->start }; } else { unescaped = parser->current_string; expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_EXPECT_STRING_CONTENT); @@ -16698,34 +16336,30 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 // be able to contain all of the parts. if (match1(parser, PM_TOKEN_STRING_CONTENT)) { pm_node_list_t parts = { 0 }; - - pm_token_t delimiters = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &delimiters, &content, &delimiters, &unescaped); - pm_node_list_append(&parts, part); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &unescaped)); + pm_node_list_append(parser->arena, &parts, part); do { - part = (pm_node_t *) pm_string_node_create_current_string(parser, &delimiters, &parser->current, &delimiters); - pm_node_list_append(&parts, part); + part = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + pm_node_list_append(parser->arena, &parts, part); parser_lex(parser); } while (match1(parser, PM_TOKEN_STRING_CONTENT)); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_EOF); - node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous); - - pm_node_list_free(&parts); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); } else if (accept1(parser, PM_TOKEN_LABEL_END)) { - node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true)); + node = UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true))); if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); } else if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_EOF); - node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); } else if (accept1(parser, PM_TOKEN_STRING_END)) { - node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped)); } else { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_type_human(parser->previous.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, PM_ERR_STRING_LITERAL_TERM, pm_token_str(parser->previous.type)); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; - node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + parser->previous.type = 0; + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped)); } } else if (match1(parser, PM_TOKEN_STRING_CONTENT)) { // In this case we've hit string content so we know the string @@ -16737,7 +16371,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 parser_lex(parser); if (match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); pm_node_flag_set(node, parse_unescaped_encoding(parser)); // Kind of odd behavior, but basically if we have an @@ -16747,43 +16381,38 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 if (!accept1(parser, PM_TOKEN_STRING_END)) { const uint8_t *location = parser->previous.end; if (location > parser->start && location[-1] == '\n') location--; - pm_parser_err(parser, location, location, PM_ERR_STRING_LITERAL_EOF); + pm_parser_err(parser, U32(location - parser->start), 0, PM_ERR_STRING_LITERAL_EOF); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } } else if (accept1(parser, PM_TOKEN_LABEL_END)) { - node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true)); + node = UP(pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &content, &unescaped, true))); if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); } else { // If we get here, then we have interpolation so we'll need // to create a string or symbol node with interpolation. pm_node_list_t parts = { 0 }; - pm_token_t string_opening = not_provided(parser); - pm_token_t string_closing = not_provided(parser); - - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &string_opening, &parser->previous, &string_closing, &unescaped); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, NULL, &parser->previous, NULL, &unescaped)); pm_node_flag_set(part, parse_unescaped_encoding(parser)); - pm_node_list_append(&parts, part); + pm_node_list_append(parser->arena, &parts, part); while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_node_list_append(&parts, part); + pm_node_list_append(parser->arena, &parts, part); } } if (accept1(parser, PM_TOKEN_LABEL_END)) { - node = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous); + node = UP(pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous)); if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); } else if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_STRING_INTERPOLATED_TERM); - node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current)); } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM); - node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); } - - pm_node_list_free(&parts); } } else { // If we get here, then the first part of the string is not plain @@ -16794,22 +16423,20 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 while (!match3(parser, PM_TOKEN_STRING_END, PM_TOKEN_LABEL_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_node_list_append(&parts, part); + pm_node_list_append(parser->arena, &parts, part); } } if (accept1(parser, PM_TOKEN_LABEL_END)) { - node = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous); + node = UP(pm_interpolated_symbol_node_create(parser, &opening, &parts, &parser->previous)); if (!label_allowed) pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_LABEL); } else if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_STRING_INTERPOLATED_TERM); - node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->current)); } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_INTERPOLATED_TERM); - node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous); + node = UP(pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous)); } - - pm_node_list_free(&parts); } if (current == NULL) { @@ -16839,14 +16466,12 @@ parse_strings(pm_parser_t *parser, pm_node_t *current, bool accepts_label, uint1 } concating = true; - pm_token_t bounds = not_provided(parser); - - pm_interpolated_string_node_t *container = pm_interpolated_string_node_create(parser, &bounds, NULL, &bounds); - pm_interpolated_string_node_append(container, current); - current = (pm_node_t *) container; + pm_interpolated_string_node_t *container = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, container, current); + current = UP(container); } - pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, node); + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, node); } } @@ -16868,12 +16493,12 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag static void parse_pattern_capture(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_constant_id_t capture, const pm_location_t *location) { // Skip this capture if it starts with an underscore. - if (*location->start == '_') return; + if (peek_at(parser, parser->start + location->start) == '_') return; if (pm_constant_id_list_includes(captures, capture)) { - pm_parser_err(parser, location->start, location->end, PM_ERR_PATTERN_CAPTURE_DUPLICATE); + pm_parser_err(parser, location->start, location->length, PM_ERR_PATTERN_CAPTURE_DUPLICATE); } else { - pm_constant_id_list_append(captures, capture); + pm_constant_id_list_append(parser->arena, captures, capture); } } @@ -16887,7 +16512,7 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures while (accept1(parser, PM_TOKEN_COLON_COLON)) { pm_token_t delimiter = parser->previous; expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); - node = (pm_node_t *) pm_constant_path_node_create(parser, node, &delimiter, &parser->previous); + node = UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); } // If there is a [ or ( that follows, then this is part of a larger pattern @@ -16908,7 +16533,7 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET, (uint16_t) (depth + 1)); accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET); + expect1_opening(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET, &opening); } closing = parser->previous; @@ -16920,7 +16545,7 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1)); accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &opening); } closing = parser->previous; @@ -16929,7 +16554,7 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures if (!inner) { // If there was no inner pattern, then we have something like Foo() or // Foo[]. In that case we'll create an array pattern with no requireds. - return (pm_node_t *) pm_array_pattern_node_constant_create(parser, node, &opening, &closing); + return UP(pm_array_pattern_node_constant_create(parser, node, &opening, &closing)); } // Now that we have the inner pattern, check to see if it's an array, find, @@ -16940,15 +16565,15 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures case PM_ARRAY_PATTERN_NODE: { pm_array_pattern_node_t *pattern_node = (pm_array_pattern_node_t *) inner; - if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { - pattern_node->base.location.start = node->location.start; - pattern_node->base.location.end = closing.end; + if (pattern_node->constant == NULL && pattern_node->opening_loc.length == 0) { + PM_NODE_START_SET_NODE(pattern_node, node); + PM_NODE_LENGTH_SET_TOKEN(parser, pattern_node, &closing); pattern_node->constant = node; - pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + pattern_node->opening_loc = TOK2LOC(parser, &opening); + pattern_node->closing_loc = TOK2LOC(parser, &closing); - return (pm_node_t *) pattern_node; + return UP(pattern_node); } break; @@ -16956,15 +16581,15 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures case PM_FIND_PATTERN_NODE: { pm_find_pattern_node_t *pattern_node = (pm_find_pattern_node_t *) inner; - if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { - pattern_node->base.location.start = node->location.start; - pattern_node->base.location.end = closing.end; + if (pattern_node->constant == NULL && pattern_node->opening_loc.length == 0) { + PM_NODE_START_SET_NODE(pattern_node, node); + PM_NODE_LENGTH_SET_TOKEN(parser, pattern_node, &closing); pattern_node->constant = node; - pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + pattern_node->opening_loc = TOK2LOC(parser, &opening); + pattern_node->closing_loc = TOK2LOC(parser, &closing); - return (pm_node_t *) pattern_node; + return UP(pattern_node); } break; @@ -16972,15 +16597,15 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures case PM_HASH_PATTERN_NODE: { pm_hash_pattern_node_t *pattern_node = (pm_hash_pattern_node_t *) inner; - if (pattern_node->constant == NULL && pattern_node->opening_loc.start == NULL) { - pattern_node->base.location.start = node->location.start; - pattern_node->base.location.end = closing.end; + if (pattern_node->constant == NULL && pattern_node->opening_loc.length == 0) { + PM_NODE_START_SET_NODE(pattern_node, node); + PM_NODE_LENGTH_SET_TOKEN(parser, pattern_node, &closing); pattern_node->constant = node; - pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + pattern_node->opening_loc = TOK2LOC(parser, &opening); + pattern_node->closing_loc = TOK2LOC(parser, &closing); - return (pm_node_t *) pattern_node; + return UP(pattern_node); } break; @@ -16993,8 +16618,8 @@ parse_pattern_constant_path(pm_parser_t *parser, pm_constant_id_list_t *captures // attaching its constant. In this case we'll create an array pattern and // attach our constant to it. pm_array_pattern_node_t *pattern_node = pm_array_pattern_node_constant_create(parser, node, &opening, &closing); - pm_array_pattern_node_requireds_append(pattern_node, inner); - return (pm_node_t *) pattern_node; + pm_array_pattern_node_requireds_append(parser->arena, pattern_node, inner); + return UP(pattern_node); } /** @@ -17010,21 +16635,20 @@ parse_pattern_rest(pm_parser_t *parser, pm_constant_id_list_t *captures) { // will check for that here. If they do, then we'll add it to the local // table since this pattern will cause it to become a local variable. if (accept1(parser, PM_TOKEN_IDENTIFIER)) { - pm_token_t identifier = parser->previous; - pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &identifier); + pm_constant_id_t constant_id = pm_parser_constant_id_token(parser, &parser->previous); int depth; if ((depth = pm_parser_local_depth_constant_id(parser, constant_id)) == -1) { - pm_parser_local_add(parser, constant_id, identifier.start, identifier.end, 0); + pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); } - parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&identifier)); - name = (pm_node_t *) pm_local_variable_target_node_create( + parse_pattern_capture(parser, captures, constant_id, &TOK2LOC(parser, &parser->previous)); + name = UP(pm_local_variable_target_node_create( parser, - &PM_LOCATION_TOKEN_VALUE(&identifier), + &TOK2LOC(parser, &parser->previous), constant_id, (uint32_t) (depth == -1 ? 0 : depth) - ); + )); } // Finally we can return the created node. @@ -17043,7 +16667,7 @@ parse_pattern_keyword_rest(pm_parser_t *parser, pm_constant_id_list_t *captures) pm_node_t *value = NULL; if (accept1(parser, PM_TOKEN_KEYWORD_NIL)) { - return (pm_node_t *) pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous); + return UP(pm_no_keywords_parameter_node_create(parser, &operator, &parser->previous)); } if (accept1(parser, PM_TOKEN_IDENTIFIER)) { @@ -17054,16 +16678,16 @@ parse_pattern_keyword_rest(pm_parser_t *parser, pm_constant_id_list_t *captures) pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); } - parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); - value = (pm_node_t *) pm_local_variable_target_node_create( + parse_pattern_capture(parser, captures, constant_id, &TOK2LOC(parser, &parser->previous)); + value = UP(pm_local_variable_target_node_create( parser, - &PM_LOCATION_TOKEN_VALUE(&parser->previous), + &TOK2LOC(parser, &parser->previous), constant_id, (uint32_t) (depth == -1 ? 0 : depth) - ); + )); } - return (pm_node_t *) pm_assoc_splat_node_create(parser, value, &operator); + return UP(pm_assoc_splat_node_create(parser, value, &operator)); } /** @@ -17100,22 +16724,24 @@ pm_slice_is_valid_local(const pm_parser_t *parser, const uint8_t *start, const u static pm_node_t * parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_symbol_node_t *key) { const pm_location_t *value_loc = &((pm_symbol_node_t *) key)->value_loc; + const uint8_t *start = parser->start + PM_LOCATION_START(value_loc); + const uint8_t *end = parser->start + PM_LOCATION_END(value_loc); - pm_constant_id_t constant_id = pm_parser_constant_id_location(parser, value_loc->start, value_loc->end); + pm_constant_id_t constant_id = pm_parser_constant_id_raw(parser, start, end); int depth = -1; - if (pm_slice_is_valid_local(parser, value_loc->start, value_loc->end)) { + if (pm_slice_is_valid_local(parser, start, end)) { depth = pm_parser_local_depth_constant_id(parser, constant_id); } else { - pm_parser_err(parser, key->base.location.start, key->base.location.end, PM_ERR_PATTERN_HASH_KEY_LOCALS); + pm_parser_err(parser, PM_NODE_START(key), PM_NODE_LENGTH(key), PM_ERR_PATTERN_HASH_KEY_LOCALS); - if ((value_loc->end > value_loc->start) && ((value_loc->end[-1] == '!') || (value_loc->end[-1] == '?'))) { - PM_PARSER_ERR_LOCATION_FORMAT(parser, value_loc, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE, (int) (value_loc->end - value_loc->start), (const char *) value_loc->start); + if ((end > start) && ((end[-1] == '!') || (end[-1] == '?'))) { + PM_PARSER_ERR_FORMAT(parser, value_loc->start, value_loc->length, PM_ERR_INVALID_LOCAL_VARIABLE_WRITE, (int) (end - start), (const char *) start); } } if (depth == -1) { - pm_parser_local_add(parser, constant_id, value_loc->start, value_loc->end, 0); + pm_parser_local_add(parser, constant_id, start, end, 0); } parse_pattern_capture(parser, captures, constant_id, value_loc); @@ -17126,7 +16752,7 @@ parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_constant_id_list_t *ca (uint32_t) (depth == -1 ? 0 : depth) ); - return (pm_node_t *) pm_implicit_node_create(parser, (pm_node_t *) target); + return UP(pm_implicit_node_create(parser, UP(target))); } /** @@ -17135,7 +16761,7 @@ parse_pattern_hash_implicit_value(pm_parser_t *parser, pm_constant_id_list_t *ca */ static void parse_pattern_hash_key(pm_parser_t *parser, pm_static_literals_t *keys, pm_node_t *node) { - if (pm_static_literals_add(&parser->newline_list, parser->start_line, keys, node, true) != NULL) { + if (pm_static_literals_add(&parser->line_offsets, parser->start, parser->start_line, keys, node, true) != NULL) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_KEY_DUPLICATE); } } @@ -17154,25 +16780,31 @@ parse_pattern_hash(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node case PM_NO_KEYWORDS_PARAMETER_NODE: rest = first_node; break; + case PM_INTERPOLATED_SYMBOL_NODE: case PM_SYMBOL_NODE: { - if (pm_symbol_node_label_p(first_node)) { - parse_pattern_hash_key(parser, &keys, first_node); + if (pm_symbol_node_label_p(parser, first_node)) { + if (PM_NODE_TYPE_P(first_node, PM_INTERPOLATED_SYMBOL_NODE)) { + pm_parser_err_node(parser, first_node, PM_ERR_PATTERN_HASH_KEY_INTERPOLATED); + } else { + parse_pattern_hash_key(parser, &keys, first_node); + } + pm_node_t *value; if (match8(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF)) { - // Otherwise, we will create an implicit local variable - // target for the value. - value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) first_node); + if (PM_NODE_TYPE_P(first_node, PM_SYMBOL_NODE)) { + value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) first_node); + } else { + value = UP(pm_error_recovery_node_create(parser, PM_NODE_END(first_node), 0)); + } } else { // Here we have a value for the first assoc in the list, so // we will parse it now. value = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY, (uint16_t) (depth + 1)); } - pm_token_t operator = not_provided(parser); - pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value); - - pm_node_list_append(&assocs, assoc); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, first_node, NULL, value)); + pm_node_list_append(parser->arena, &assocs, assoc); break; } } @@ -17184,11 +16816,10 @@ parse_pattern_hash(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node pm_diagnostic_id_t diag_id = PM_NODE_TYPE_P(first_node, PM_INTERPOLATED_SYMBOL_NODE) ? PM_ERR_PATTERN_HASH_KEY_INTERPOLATED : PM_ERR_PATTERN_HASH_KEY_LABEL; pm_parser_err_node(parser, first_node, diag_id); - pm_token_t operator = not_provided(parser); - pm_node_t *value = (pm_node_t *) pm_missing_node_create(parser, first_node->location.start, first_node->location.end); - pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, first_node, &operator, value); + pm_node_t *value = UP(pm_error_recovery_node_create(parser, PM_NODE_START(first_node), PM_NODE_LENGTH(first_node))); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, first_node, NULL, value)); - pm_node_list_append(&assocs, assoc); + pm_node_list_append(parser->arena, &assocs, assoc); break; } } @@ -17212,7 +16843,7 @@ parse_pattern_hash(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node rest = assoc; } else { pm_parser_err_node(parser, assoc, PM_ERR_PATTERN_EXPRESSION_AFTER_REST); - pm_node_list_append(&assocs, assoc); + pm_node_list_append(parser->arena, &assocs, assoc); } } else { pm_node_t *key; @@ -17222,36 +16853,43 @@ parse_pattern_hash(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node if (PM_NODE_TYPE_P(key, PM_INTERPOLATED_SYMBOL_NODE)) { pm_parser_err_node(parser, key, PM_ERR_PATTERN_HASH_KEY_INTERPOLATED); - } else if (!pm_symbol_node_label_p(key)) { + } else if (!pm_symbol_node_label_p(parser, key)) { pm_parser_err_node(parser, key, PM_ERR_PATTERN_LABEL_AFTER_COMMA); } + } else if (accept1(parser, PM_TOKEN_LABEL)) { + key = UP(pm_symbol_node_label_create(parser, &parser->previous)); } else { expect1(parser, PM_TOKEN_LABEL, PM_ERR_PATTERN_LABEL_AFTER_COMMA); - key = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous); + + pm_token_t label = { .type = PM_TOKEN_LABEL, .start = parser->previous.end, .end = parser->previous.end }; + key = UP(pm_symbol_node_create(parser, NULL, &label, NULL)); } parse_pattern_hash_key(parser, &keys, key); pm_node_t *value = NULL; if (match7(parser, PM_TOKEN_COMMA, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) key); + if (PM_NODE_TYPE_P(key, PM_SYMBOL_NODE)) { + value = parse_pattern_hash_implicit_value(parser, captures, (pm_symbol_node_t *) key); + } else { + value = UP(pm_error_recovery_node_create(parser, PM_NODE_END(key), 0)); + } } else { value = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_KEY, (uint16_t) (depth + 1)); } - pm_token_t operator = not_provided(parser); - pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, key, &operator, value); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, key, NULL, value)); if (rest != NULL) { pm_parser_err_node(parser, assoc, PM_ERR_PATTERN_EXPRESSION_AFTER_REST); } - pm_node_list_append(&assocs, assoc); + pm_node_list_append(parser->arena, &assocs, assoc); } } pm_hash_pattern_node_t *node = pm_hash_pattern_node_node_list_create(parser, &assocs, rest); - xfree(assocs.nodes); + // assocs.nodes is arena-allocated; no explicit free needed. pm_static_literals_free(&keys); return node; @@ -17273,13 +16911,13 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); } - parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); - return (pm_node_t *) pm_local_variable_target_node_create( + parse_pattern_capture(parser, captures, constant_id, &TOK2LOC(parser, &parser->previous)); + return UP(pm_local_variable_target_node_create( parser, - &PM_LOCATION_TOKEN_VALUE(&parser->previous), + &TOK2LOC(parser, &parser->previous), constant_id, (uint32_t) (depth == -1 ? 0 : depth) - ); + )); } case PM_TOKEN_BRACKET_LEFT_ARRAY: { pm_token_t opening = parser->current; @@ -17288,7 +16926,7 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm if (accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { // If we have an empty array pattern, then we'll just return a new // array pattern node. - return (pm_node_t *) pm_array_pattern_node_empty_create(parser, &opening, &parser->previous); + return UP(pm_array_pattern_node_empty_create(parser, &opening, &parser->previous)); } // Otherwise, we'll parse the inner pattern, then deal with it depending @@ -17296,34 +16934,34 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm pm_node_t *inner = parse_pattern(parser, captures, PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET, (uint16_t) (depth + 1)); accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET); + expect1_opening(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_PATTERN_TERM_BRACKET, &opening); pm_token_t closing = parser->previous; switch (PM_NODE_TYPE(inner)) { case PM_ARRAY_PATTERN_NODE: { pm_array_pattern_node_t *pattern_node = (pm_array_pattern_node_t *) inner; - if (pattern_node->opening_loc.start == NULL) { - pattern_node->base.location.start = opening.start; - pattern_node->base.location.end = closing.end; + if (pattern_node->opening_loc.length == 0) { + PM_NODE_START_SET_TOKEN(parser, pattern_node, &opening); + PM_NODE_LENGTH_SET_TOKEN(parser, pattern_node, &closing); - pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + pattern_node->opening_loc = TOK2LOC(parser, &opening); + pattern_node->closing_loc = TOK2LOC(parser, &closing); - return (pm_node_t *) pattern_node; + return UP(pattern_node); } break; } case PM_FIND_PATTERN_NODE: { pm_find_pattern_node_t *pattern_node = (pm_find_pattern_node_t *) inner; - if (pattern_node->opening_loc.start == NULL) { - pattern_node->base.location.start = opening.start; - pattern_node->base.location.end = closing.end; + if (pattern_node->opening_loc.length == 0) { + PM_NODE_START_SET_TOKEN(parser, pattern_node, &opening); + PM_NODE_LENGTH_SET_TOKEN(parser, pattern_node, &closing); - pattern_node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pattern_node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + pattern_node->opening_loc = TOK2LOC(parser, &opening); + pattern_node->closing_loc = TOK2LOC(parser, &closing); - return (pm_node_t *) pattern_node; + return UP(pattern_node); } break; @@ -17333,8 +16971,8 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm } pm_array_pattern_node_t *node = pm_array_pattern_node_empty_create(parser, &opening, &closing); - pm_array_pattern_node_requireds_append(node, inner); - return (pm_node_t *) node; + pm_array_pattern_node_requireds_append(parser->arena, node, inner); + return UP(node); } case PM_TOKEN_BRACE_LEFT: { bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; @@ -17354,19 +16992,19 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm switch (parser->current.type) { case PM_TOKEN_LABEL: parser_lex(parser); - first_node = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous); + first_node = UP(pm_symbol_node_label_create(parser, &parser->previous)); break; case PM_TOKEN_USTAR_STAR: first_node = parse_pattern_keyword_rest(parser, captures); break; case PM_TOKEN_STRING_BEGIN: - first_node = parse_expression(parser, PM_BINDING_POWER_MAX, false, true, PM_ERR_PATTERN_HASH_KEY_LABEL, (uint16_t) (depth + 1)); + first_node = parse_expression(parser, PM_BINDING_POWER_MAX, PM_PARSE_ACCEPTS_DO_BLOCK | PM_PARSE_ACCEPTS_LABEL, PM_ERR_PATTERN_HASH_KEY_LABEL, (uint16_t) (depth + 1)); break; default: { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_PATTERN_HASH_KEY, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_PATTERN_HASH_KEY, pm_token_str(parser->current.type)); parser_lex(parser); - first_node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + first_node = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous))); break; } } @@ -17374,18 +17012,18 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm node = parse_pattern_hash(parser, captures, first_node, (uint16_t) (depth + 1)); accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_PATTERN_TERM_BRACE); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_PATTERN_TERM_BRACE, &opening); pm_token_t closing = parser->previous; - node->base.location.start = opening.start; - node->base.location.end = closing.end; + PM_NODE_START_SET_TOKEN(parser, node, &opening); + PM_NODE_LENGTH_SET_TOKEN(parser, node, &closing); - node->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - node->closing_loc = PM_LOCATION_TOKEN_VALUE(&closing); + node->opening_loc = TOK2LOC(parser, &opening); + node->closing_loc = TOK2LOC(parser, &closing); } parser->pattern_matching_newlines = previous_pattern_matching_newlines; - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_UDOT_DOT: case PM_TOKEN_UDOT_DOT_DOT: { @@ -17396,21 +17034,27 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm // expression as the right side of the range. switch (parser->current.type) { case PM_CASE_PRIMITIVE: { - pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, false, false, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right); + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); + return UP(pm_range_node_create(parser, NULL, &operator, right)); } default: { pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE); - pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end); - return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right); + pm_node_t *right = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &operator), PM_TOKEN_LENGTH(&operator))); + return UP(pm_range_node_create(parser, NULL, &operator, right)); } } } case PM_CASE_PRIMITIVE: { - pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_MAX, false, true, diag_id, (uint16_t) (depth + 1)); + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_MAX, PM_PARSE_ACCEPTS_LABEL | PM_PARSE_ACCEPTS_DO_BLOCK, diag_id, (uint16_t) (depth + 1)); // If we found a label, we need to immediately return to the caller. - if (pm_symbol_node_label_p(node)) return node; + if (pm_symbol_node_label_p(parser, node)) return node; + + // Call nodes (arithmetic operations) are not allowed in patterns + if (PM_NODE_TYPE(node) == PM_CALL_NODE) { + pm_parser_err_node(parser, node, diag_id); + return UP(pm_error_recovery_node_create_unexpected(parser, node)); + } // Now that we have a primitive, we need to check if it's part of a range. if (accept2(parser, PM_TOKEN_DOT_DOT, PM_TOKEN_DOT_DOT_DOT)) { @@ -17421,11 +17065,11 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm // node. Otherwise, we'll create an endless range. switch (parser->current.type) { case PM_CASE_PRIMITIVE: { - pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, false, false, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_range_node_create(parser, node, &operator, right); + pm_node_t *right = parse_expression(parser, PM_BINDING_POWER_MAX, PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_PATTERN_EXPRESSION_AFTER_RANGE, (uint16_t) (depth + 1)); + return UP(pm_range_node_create(parser, node, &operator, right)); } default: - return (pm_node_t *) pm_range_node_create(parser, node, &operator, NULL); + return UP(pm_range_node_create(parser, node, &operator, NULL)); } } @@ -17440,44 +17084,44 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm switch (parser->current.type) { case PM_TOKEN_IDENTIFIER: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) parse_variable(parser); + pm_node_t *variable = UP(parse_variable(parser)); if (variable == NULL) { - PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, parser->previous, PM_ERR_NO_LOCAL_VARIABLE); - variable = (pm_node_t *) pm_local_variable_read_node_missing_create(parser, &parser->previous, 0); + PM_PARSER_ERR_TOKEN_FORMAT_CONTENT(parser, &parser->previous, PM_ERR_NO_LOCAL_VARIABLE); + variable = UP(pm_local_variable_read_node_missing_create(parser, &parser->previous, 0)); } - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_INSTANCE_VARIABLE: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) pm_instance_variable_read_node_create(parser, &parser->previous); + pm_node_t *variable = UP(pm_instance_variable_read_node_create(parser, &parser->previous)); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_CLASS_VARIABLE: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) pm_class_variable_read_node_create(parser, &parser->previous); + pm_node_t *variable = UP(pm_class_variable_read_node_create(parser, &parser->previous)); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_GLOBAL_VARIABLE: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous); + pm_node_t *variable = UP(pm_global_variable_read_node_create(parser, &parser->previous)); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_NUMBERED_REFERENCE: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) pm_numbered_reference_read_node_create(parser, &parser->previous); + pm_node_t *variable = UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_BACK_REFERENCE: { parser_lex(parser); - pm_node_t *variable = (pm_node_t *) pm_back_reference_read_node_create(parser, &parser->previous); + pm_node_t *variable = UP(pm_back_reference_read_node_create(parser, &parser->previous)); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } case PM_TOKEN_PARENTHESIS_LEFT: { bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; @@ -17486,19 +17130,19 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm pm_token_t lparen = parser->current; parser_lex(parser); - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN, (uint16_t) (depth + 1)); + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, PM_PARSE_ACCEPTS_DO_BLOCK | PM_PARSE_ACCEPTS_COMMAND_CALL, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN, (uint16_t) (depth + 1)); parser->pattern_matching_newlines = previous_pattern_matching_newlines; accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN); - return (pm_node_t *) pm_pinned_expression_node_create(parser, expression, &operator, &lparen, &parser->previous); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &lparen); + return UP(pm_pinned_expression_node_create(parser, expression, &operator, &lparen, &parser->previous)); } default: { // If we get here, then we have a pin operator followed by something // not understood. We'll create a missing node and return that. pm_parser_err_token(parser, &operator, PM_ERR_PATTERN_EXPRESSION_AFTER_PIN); - pm_node_t *variable = (pm_node_t *) pm_missing_node_create(parser, operator.start, operator.end); - return (pm_node_t *) pm_pinned_variable_node_create(parser, &operator, variable); + pm_node_t *variable = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &operator), PM_TOKEN_LENGTH(&operator))); + return UP(pm_pinned_variable_node_create(parser, &operator, variable)); } } } @@ -17509,31 +17153,56 @@ parse_pattern_primitive(pm_parser_t *parser, pm_constant_id_list_t *captures, pm expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); pm_constant_path_node_t *node = pm_constant_path_node_create(parser, NULL, &delimiter, &parser->previous); - return parse_pattern_constant_path(parser, captures, (pm_node_t *) node, (uint16_t) (depth + 1)); + return parse_pattern_constant_path(parser, captures, UP(node), (uint16_t) (depth + 1)); } case PM_TOKEN_CONSTANT: { pm_token_t constant = parser->current; parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_constant_read_node_create(parser, &constant); + pm_node_t *node = UP(pm_constant_read_node_create(parser, &constant)); return parse_pattern_constant_path(parser, captures, node, (uint16_t) (depth + 1)); } default: pm_parser_err_current(parser, diag_id); - return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); + } +} + +static bool +parse_pattern_alternation_error_each(const pm_node_t *node, void *data) { + switch (PM_NODE_TYPE(node)) { + case PM_LOCAL_VARIABLE_TARGET_NODE: { + pm_parser_t *parser = (pm_parser_t *) data; + pm_parser_err(parser, PM_NODE_START(node), PM_NODE_LENGTH(node), PM_ERR_PATTERN_CAPTURE_IN_ALTERNATIVE); + return false; + } + default: + return true; } } /** + * When we get here, we know that we already have a syntax error, because we + * know we have captured a variable and that we are in an alternation. + */ +static void +parse_pattern_alternation_error(pm_parser_t *parser, const pm_node_t *node) { + pm_visit_node(node, parse_pattern_alternation_error_each, parser); +} + +/** * Parse any number of primitives joined by alternation and ended optionally by * assignment. */ static pm_node_t * parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, pm_node_t *first_node, pm_diagnostic_id_t diag_id, uint16_t depth) { pm_node_t *node = first_node; + bool alternation = false; - while ((node == NULL) || accept1(parser, PM_TOKEN_PIPE)) { - pm_token_t operator = parser->previous; + while ((node == NULL) || (alternation = accept1(parser, PM_TOKEN_PIPE))) { + if (alternation && !PM_NODE_TYPE_P(node, PM_ALTERNATION_PATTERN_NODE) && captures->size) { + parse_pattern_alternation_error(parser, node); + } switch (parser->current.type) { case PM_TOKEN_IDENTIFIER: @@ -17545,41 +17214,47 @@ parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, p case PM_TOKEN_UDOT_DOT: case PM_TOKEN_UDOT_DOT_DOT: case PM_CASE_PRIMITIVE: { - if (node == NULL) { + if (!alternation) { node = parse_pattern_primitive(parser, captures, diag_id, (uint16_t) (depth + 1)); } else { + pm_token_t operator = parser->previous; pm_node_t *right = parse_pattern_primitive(parser, captures, PM_ERR_PATTERN_EXPRESSION_AFTER_PIPE, (uint16_t) (depth + 1)); - node = (pm_node_t *) pm_alternation_pattern_node_create(parser, node, right, &operator); + + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &operator)); } break; } case PM_TOKEN_PARENTHESIS_LEFT: case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: { + pm_token_t operator = parser->previous; pm_token_t opening = parser->current; parser_lex(parser); pm_node_t *body = parse_pattern(parser, captures, PM_PARSE_PATTERN_SINGLE, PM_ERR_PATTERN_EXPRESSION_AFTER_PAREN, (uint16_t) (depth + 1)); accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN); - pm_node_t *right = (pm_node_t *) pm_parentheses_node_create(parser, &opening, body, &parser->previous, 0); + expect1_opening(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_PATTERN_TERM_PAREN, &opening); + pm_node_t *right = UP(pm_parentheses_node_create(parser, &opening, body, &parser->previous, 0)); - if (node == NULL) { + if (!alternation) { node = right; } else { - node = (pm_node_t *) pm_alternation_pattern_node_create(parser, node, right, &operator); + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &operator)); } break; } default: { pm_parser_err_current(parser, diag_id); - pm_node_t *right = (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + pm_node_t *right = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); - if (node == NULL) { + if (!alternation) { node = right; } else { - node = (pm_node_t *) pm_alternation_pattern_node_create(parser, node, right, &operator); + if (captures->size) parse_pattern_alternation_error(parser, right); + node = UP(pm_alternation_pattern_node_create(parser, node, right, &parser->previous)); } break; @@ -17600,15 +17275,15 @@ parse_pattern_primitives(pm_parser_t *parser, pm_constant_id_list_t *captures, p pm_parser_local_add(parser, constant_id, parser->previous.start, parser->previous.end, 0); } - parse_pattern_capture(parser, captures, constant_id, &PM_LOCATION_TOKEN_VALUE(&parser->previous)); + parse_pattern_capture(parser, captures, constant_id, &TOK2LOC(parser, &parser->previous)); pm_local_variable_target_node_t *target = pm_local_variable_target_node_create( parser, - &PM_LOCATION_TOKEN_VALUE(&parser->previous), + &TOK2LOC(parser, &parser->previous), constant_id, (uint32_t) (depth == -1 ? 0 : depth) ); - node = (pm_node_t *) pm_capture_pattern_node_create(parser, node, target, &operator); + node = UP(pm_capture_pattern_node_create(parser, node, target, &operator)); } return node; @@ -17627,8 +17302,8 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag switch (parser->current.type) { case PM_TOKEN_LABEL: { parser_lex(parser); - pm_node_t *key = (pm_node_t *) pm_symbol_node_label_create(parser, &parser->previous); - node = (pm_node_t *) parse_pattern_hash(parser, captures, key, (uint16_t) (depth + 1)); + pm_node_t *key = UP(pm_symbol_node_label_create(parser, &parser->previous)); + node = UP(parse_pattern_hash(parser, captures, key, (uint16_t) (depth + 1))); if (!(flags & PM_PARSE_PATTERN_TOP)) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT); @@ -17638,7 +17313,7 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag } case PM_TOKEN_USTAR_STAR: { node = parse_pattern_keyword_rest(parser, captures); - node = (pm_node_t *) parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)); + node = UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1))); if (!(flags & PM_PARSE_PATTERN_TOP)) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT); @@ -17651,8 +17326,8 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag // be dynamic symbols leading to hash patterns. node = parse_pattern_primitive(parser, captures, diag_id, (uint16_t) (depth + 1)); - if (pm_symbol_node_label_p(node)) { - node = (pm_node_t *) parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)); + if (pm_symbol_node_label_p(parser, node)) { + node = UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1))); if (!(flags & PM_PARSE_PATTERN_TOP)) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_HASH_IMPLICIT); @@ -17667,7 +17342,7 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag case PM_TOKEN_USTAR: { if (flags & (PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI)) { parser_lex(parser); - node = (pm_node_t *) parse_pattern_rest(parser, captures); + node = UP(parse_pattern_rest(parser, captures)); leading_rest = true; break; } @@ -17680,8 +17355,8 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag // If we got a dynamic label symbol, then we need to treat it like the // beginning of a hash pattern. - if (pm_symbol_node_label_p(node)) { - return (pm_node_t *) parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1)); + if (pm_symbol_node_label_p(parser, node)) { + return UP(parse_pattern_hash(parser, captures, node, (uint16_t) (depth + 1))); } if ((flags & PM_PARSE_PATTERN_MULTI) && match1(parser, PM_TOKEN_COMMA)) { @@ -17689,20 +17364,20 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag // or a find pattern. We need to parse all of the patterns, put them // into a big list, and then determine which type of node we have. pm_node_list_t nodes = { 0 }; - pm_node_list_append(&nodes, node); + pm_node_list_append(parser->arena, &nodes, node); // Gather up all of the patterns into the list. while (accept1(parser, PM_TOKEN_COMMA)) { // Break early here in case we have a trailing comma. - if (match9(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE, PM_TOKEN_EOF,PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) { - node = (pm_node_t *) pm_implicit_rest_node_create(parser, &parser->previous); - pm_node_list_append(&nodes, node); + if (match7(parser, PM_TOKEN_KEYWORD_THEN, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_AND, PM_TOKEN_KEYWORD_OR)) { + node = UP(pm_implicit_rest_node_create(parser, &parser->previous)); + pm_node_list_append(parser->arena, &nodes, node); trailing_rest = true; break; } if (accept1(parser, PM_TOKEN_USTAR)) { - node = (pm_node_t *) parse_pattern_rest(parser, captures); + node = UP(parse_pattern_rest(parser, captures)); // If we have already parsed a splat pattern, then this is an // error. We will continue to parse the rest of the patterns, @@ -17716,7 +17391,7 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag node = parse_pattern_primitives(parser, captures, NULL, PM_ERR_PATTERN_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); } - pm_node_list_append(&nodes, node); + pm_node_list_append(parser->arena, &nodes, node); } // If the first pattern and the last pattern are rest patterns, then we @@ -17724,24 +17399,24 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag // are in between because we know we already added the appropriate // errors. Otherwise we will create an array pattern. if (leading_rest && PM_NODE_TYPE_P(nodes.nodes[nodes.size - 1], PM_SPLAT_NODE)) { - node = (pm_node_t *) pm_find_pattern_node_create(parser, &nodes); + node = UP(pm_find_pattern_node_create(parser, &nodes)); if (nodes.size == 2) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_FIND_MISSING_INNER); } } else { - node = (pm_node_t *) pm_array_pattern_node_node_list_create(parser, &nodes); + node = UP(pm_array_pattern_node_node_list_create(parser, &nodes)); if (leading_rest && trailing_rest) { pm_parser_err_node(parser, node, PM_ERR_PATTERN_ARRAY_MULTIPLE_RESTS); } } - xfree(nodes.nodes); + // nodes.nodes is arena-allocated; no explicit free needed. } else if (leading_rest) { // Otherwise, if we parsed a single splat pattern, then we know we have // an array pattern, so we can go ahead and create that node. - node = (pm_node_t *) pm_array_pattern_node_rest_create(parser, node); + node = UP(pm_array_pattern_node_rest_create(parser, node)); } return node; @@ -17752,29 +17427,33 @@ parse_pattern(pm_parser_t *parser, pm_constant_id_list_t *captures, uint8_t flag * from its start bounds. If it's a compound node, then we will recursively * apply this function to its value. */ -static inline void +static PRISM_INLINE void parse_negative_numeric(pm_node_t *node) { switch (PM_NODE_TYPE(node)) { case PM_INTEGER_NODE: { pm_integer_node_t *cast = (pm_integer_node_t *) node; cast->base.location.start--; + cast->base.location.length++; cast->value.negative = true; break; } case PM_FLOAT_NODE: { pm_float_node_t *cast = (pm_float_node_t *) node; cast->base.location.start--; + cast->base.location.length++; cast->value = -cast->value; break; } case PM_RATIONAL_NODE: { pm_rational_node_t *cast = (pm_rational_node_t *) node; cast->base.location.start--; + cast->base.location.length++; cast->numerator.negative = true; break; } case PM_IMAGINARY_NODE: node->location.start--; + node->location.length++; parse_negative_numeric(((pm_imaginary_node_t *) node)->numeric); break; default: @@ -17792,22 +17471,22 @@ static void pm_parser_err_prefix(pm_parser_t *parser, pm_diagnostic_id_t diag_id) { switch (diag_id) { case PM_ERR_HASH_KEY: { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, pm_token_type_human(parser->previous.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, diag_id, pm_token_str(parser->previous.type)); break; } case PM_ERR_HASH_VALUE: case PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR: { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, diag_id, pm_token_str(parser->current.type)); break; } case PM_ERR_UNARY_RECEIVER: { - const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_type_human(parser->current.type)); - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, diag_id, human, parser->previous.start[0]); + const char *human = (parser->current.type == PM_TOKEN_EOF ? "end-of-input" : pm_token_str(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, diag_id, human, parser->previous.start[0]); break; } case PM_ERR_UNARY_DISALLOWED: case PM_ERR_EXPECT_ARGUMENT: { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, diag_id, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, diag_id, pm_token_str(parser->current.type)); break; } default: @@ -17887,6 +17566,7 @@ parse_retry(pm_parser_t *parser, const pm_node_t *node) { case PM_CONTEXT_BEGIN: case PM_CONTEXT_BLOCK_BRACES: case PM_CONTEXT_BLOCK_KEYWORDS: + case PM_CONTEXT_BLOCK_PARAMETERS: case PM_CONTEXT_CASE_IN: case PM_CONTEXT_CASE_WHEN: case PM_CONTEXT_DEFAULT_PARAMS: @@ -17967,6 +17647,7 @@ parse_yield(pm_parser_t *parser, const pm_node_t *node) { case PM_CONTEXT_BLOCK_KEYWORDS: case PM_CONTEXT_BLOCK_ELSE: case PM_CONTEXT_BLOCK_ENSURE: + case PM_CONTEXT_BLOCK_PARAMETERS: case PM_CONTEXT_BLOCK_RESCUE: case PM_CONTEXT_CASE_IN: case PM_CONTEXT_CASE_WHEN: @@ -18003,67 +17684,1383 @@ parse_yield(pm_parser_t *parser, const pm_node_t *node) { } /** - * This struct is used to pass information between the regular expression parser - * and the error callback. + * Determine if a given call node looks like a "command", which means it has + * arguments but does not have parentheses. */ -typedef struct { - /** The parser that we are parsing the regular expression for. */ - pm_parser_t *parser; +static PRISM_INLINE bool +pm_call_node_command_p(const pm_call_node_t *node) { + return ( + (node->opening_loc.length == 0) && + (node->block == NULL || PM_NODE_TYPE_P(node->block, PM_BLOCK_ARGUMENT_NODE)) && + (node->arguments != NULL || node->block != NULL) + ); +} - /** The start of the regular expression. */ - const uint8_t *start; +/** + * Returns true if the given node is a command-style call (a method call without + * parentheses that has arguments), excluding operator calls (e.g., a + b) which + * satisfy the same structural criteria but are not commands. + */ +static bool +pm_command_call_value_p(const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + case PM_CALL_NODE: { + const pm_call_node_t *call = (const pm_call_node_t *) node; - /** The end of the regular expression. */ - const uint8_t *end; + // Command-style calls (e.g., foo bar, obj.foo bar). + // Attribute writes (e.g., a.b = 1) are not commands. + if (pm_call_node_command_p(call) && !PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_ATTRIBUTE_WRITE) && (call->receiver == NULL || call->call_operator_loc.length > 0)) { + return true; + } - /** - * Whether or not the source of the regular expression is shared. This - * impacts the location of error messages, because if it is shared then we - * can use the location directly and if it is not, then we use the bounds of - * the regular expression itself. - */ - bool shared; -} parse_regular_expression_error_data_t; + // A `!` or `not` prefix wrapping a command call (e.g., + // `!foo bar`, `not foo bar`) is also a command-call value. + if (call->receiver != NULL && call->arguments == NULL && call->opening_loc.length == 0 && call->call_operator_loc.length == 0) { + return pm_command_call_value_p(call->receiver); + } + + return false; + } + case PM_SUPER_NODE: { + const pm_super_node_t *cast = (const pm_super_node_t *) node; + return cast->lparen_loc.length == 0 && (cast->arguments != NULL || cast->block != NULL); + } + case PM_YIELD_NODE: { + const pm_yield_node_t *cast = (const pm_yield_node_t *) node; + return cast->lparen_loc.length == 0 && cast->arguments != NULL; + } + case PM_RESCUE_MODIFIER_NODE: + return pm_command_call_value_p(((const pm_rescue_modifier_node_t *) node)->expression); + case PM_DEF_NODE: { + const pm_def_node_t *cast = (const pm_def_node_t *) node; + if (cast->equal_loc.length > 0 && cast->body != NULL) { + const pm_node_t *body = cast->body; + if (PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE)) { + body = ((const pm_statements_node_t *) body)->body.nodes[((const pm_statements_node_t *) body)->body.size - 1]; + } + return pm_command_call_value_p(body); + } + return false; + } + default: + return false; + } +} /** - * This callback is called when the regular expression parser encounters a - * syntax error. + * Returns true if the given node is a block call: a command + * with a do-block, or any call chained (via `.`, `::`, `&.`) from such a node. + * Block calls can only be followed by call chaining, composition (and/or), and + * modifier operators. */ -static void -parse_regular_expression_error(const uint8_t *start, const uint8_t *end, const char *message, void *data) { - parse_regular_expression_error_data_t *callback_data = (parse_regular_expression_error_data_t *) data; - pm_location_t location; +static bool +pm_block_call_p(const pm_node_t *node) { + while (PM_NODE_TYPE_P(node, PM_CALL_NODE)) { + const pm_call_node_t *call = (const pm_call_node_t *) node; + if (call->opening_loc.length > 0) return false; + + // Root: command with do-block (e.g., `foo bar do end`). + if (call->arguments != NULL && call->block != NULL && PM_NODE_TYPE_P(call->block, PM_BLOCK_NODE)) { + return true; + } - if (callback_data->shared) { - location = (pm_location_t) { .start = start, .end = end }; + // Walk up the receiver chain (e.g., `foo bar do end.baz`). + if (call->call_operator_loc.length > 0 && call->receiver != NULL) { + node = call->receiver; + continue; + } + + return false; + } + + return false; +} + +/** + * Parse a case expression (the `case` keyword). This handles both case-when and + * case-in (pattern matching) forms. + */ +static pm_node_t * +parse_case(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t case_keyword = parser->previous; + pm_node_t *predicate = NULL; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + predicate = NULL; + } else if (match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_END)) { + predicate = NULL; + } else if (!token_begins_expression_p(parser->current.type)) { + predicate = NULL; + } else { + predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CASE_EXPRESSION_AFTER_CASE, (uint16_t) (depth + 1)); + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + } + + if (match1(parser, PM_TOKEN_KEYWORD_END)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + parser_lex(parser); + pop_block_exits(parser, previous_block_exits); + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + return UP(pm_case_node_create(parser, &case_keyword, predicate, &parser->previous)); + } + + /* At this point we can create a case node, though we don't yet know if it + * is a case-in or case-when node. */ + pm_node_t *node; + + if (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + pm_case_node_t *case_node = pm_case_node_create(parser, &case_keyword, predicate, NULL); + pm_static_literals_t literals = { 0 }; + + /* At this point we've seen a when keyword, so we know this is a + * case-when node. We will continue to parse the when nodes until we hit + * the end of the list. */ + while (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + parser_lex(parser); + + pm_token_t when_keyword = parser->previous; + pm_when_node_t *when_node = pm_when_node_create(parser, &when_keyword); + + do { + if (accept1(parser, PM_TOKEN_USTAR)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + + pm_splat_node_t *splat_node = pm_splat_node_create(parser, &operator, expression); + pm_when_node_conditions_append(parser->arena, when_node, UP(splat_node)); + + if (PM_NODE_TYPE_P(expression, PM_ERROR_RECOVERY_NODE)) break; + } else { + pm_node_t *condition = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CASE_EXPRESSION_AFTER_WHEN, (uint16_t) (depth + 1)); + pm_when_node_conditions_append(parser->arena, when_node, condition); + + /* If we found a missing node, then this is a syntax error + * and we should stop looping. */ + if (PM_NODE_TYPE_P(condition, PM_ERROR_RECOVERY_NODE)) break; + + /* If this is a string node, then we need to mark it as + * frozen because when clause strings are frozen. */ + if (PM_NODE_TYPE_P(condition, PM_STRING_NODE)) { + pm_node_flag_set(condition, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); + } else if (PM_NODE_TYPE_P(condition, PM_SOURCE_FILE_NODE)) { + pm_node_flag_set(condition, PM_NODE_FLAG_STATIC_LITERAL); + } + + pm_when_clause_static_literals_add(parser, &literals, condition); + } + } while (accept1(parser, PM_TOKEN_COMMA)); + + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_WHEN_DELIMITER); + pm_when_node_then_keyword_loc_set(parser, when_node, &parser->previous); + } + + if (!match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_CASE_WHEN, (uint16_t) (depth + 1)); + if (statements != NULL) { + pm_when_node_statements_set(when_node, statements); + } + } + + pm_case_node_condition_append(parser->arena, case_node, UP(when_node)); + } + + /* If we didn't parse any conditions (in or when) then we need to + * indicate that we have an error. */ + if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + pm_static_literals_free(&literals); + node = UP(case_node); + } else { + pm_case_match_node_t *case_node = pm_case_match_node_create(parser, &case_keyword, predicate); + + /* If this is a case-match node (i.e., it is a pattern matching case + * statement) then we must have a predicate. */ + if (predicate == NULL) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MATCH_MISSING_PREDICATE); + } + + /* At this point we expect that we're parsing a case-in node. We will + * continue to parse the in nodes until we hit the end of the list. */ + while (match1(parser, PM_TOKEN_KEYWORD_IN)) { + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); + + bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; + parser->pattern_matching_newlines = true; + + lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); + parser->command_start = false; + parser_lex(parser); + + pm_token_t in_keyword = parser->previous; + + pm_constant_id_list_t captures = { 0 }; + pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); + + parser->pattern_matching_newlines = previous_pattern_matching_newlines; + + /* Since we're in the top-level of the case-in node we need to + * check for guard clauses in the form of `if` or `unless` + * statements. */ + if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_if_node_modifier_create(parser, pattern, &keyword, predicate)); + } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) { + pm_token_t keyword = parser->previous; + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); + pattern = UP(pm_unless_node_modifier_create(parser, pattern, &keyword, predicate)); + } + + /* Now we need to check for the terminator of the in node's pattern. + * It can be a newline or semicolon optionally followed by a `then` + * keyword. */ + pm_token_t then_keyword = { 0 }; + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { + then_keyword = parser->previous; + } + } else { + expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_IN_DELIMITER); + then_keyword = parser->previous; + } + + /* Now we can actually parse the statements associated with the in + * node. */ + pm_statements_node_t *statements; + if (match3(parser, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + statements = NULL; + } else { + statements = parse_statements(parser, PM_CONTEXT_CASE_IN, (uint16_t) (depth + 1)); + } + + /* Now that we have the full pattern and statements, we can create + * the node and attach it to the case node. */ + pm_node_t *condition = UP(pm_in_node_create(parser, pattern, statements, &in_keyword, NTOK2PTR(then_keyword))); + pm_case_match_node_condition_append(parser->arena, case_node, condition); + } + + /* If we didn't parse any conditions (in or when) then we need to + * indicate that we have an error. */ + if (case_node->conditions.size == 0) { + pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); + } + + node = UP(case_node); + } + + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + if (accept1(parser, PM_TOKEN_KEYWORD_ELSE)) { + pm_token_t else_keyword = parser->previous; + pm_else_node_t *else_node; + + if (!match1(parser, PM_TOKEN_KEYWORD_END)) { + else_node = pm_else_node_create(parser, &else_keyword, parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)), &parser->current); + } else { + else_node = pm_else_node_create(parser, &else_keyword, NULL, &parser->current); + } + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_else_clause_set((pm_case_node_t *) node, else_node); + } else { + pm_case_match_node_else_clause_set((pm_case_match_node_t *) node, else_node); + } + } + + parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CASE_TERM, &case_keyword); + + if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { + pm_case_node_end_keyword_loc_set(parser, (pm_case_node_t *) node, &parser->previous); } else { - location = (pm_location_t) { .start = callback_data->start, .end = callback_data->end }; + pm_case_match_node_end_keyword_loc_set(parser, (pm_case_match_node_t *) node, &parser->previous); } - PM_PARSER_ERR_FORMAT(callback_data->parser, location.start, location.end, PM_ERR_REGEXP_PARSE_ERROR, message); + pop_block_exits(parser, previous_block_exits); + return node; } /** - * Parse the errors for the regular expression and add them to the parser. + * Parse a class definition expression (the `class` keyword). This handles both + * regular class definitions and singleton class definitions (`class << expr`). */ -static void -parse_regular_expression_errors(pm_parser_t *parser, pm_regular_expression_node_t *node) { - const pm_string_t *unescaped = &node->unescaped; - parse_regular_expression_error_data_t error_data = { - .parser = parser, - .start = node->base.location.start, - .end = node->base.location.end, - .shared = unescaped->type == PM_STRING_SHARED - }; +static pm_node_t * +parse_class(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + + pm_token_t class_keyword = parser->previous; + pm_do_loop_stack_push(parser, false); + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + if (accept1(parser, PM_TOKEN_LESS_LESS)) { + pm_token_t operator = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS, (uint16_t) (depth + 1)); + + pm_parser_scope_push(parser, true); + if (!match2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER, pm_token_str(parser->current.type)); + } + + pm_node_t *statements = NULL; + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_SCLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_SCLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + pm_do_loop_stack_pop(parser); + + flush_block_exits(parser, previous_block_exits); + return UP(pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous)); + } + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CLASS_NAME, (uint16_t) (depth + 1)); + pm_token_t name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME); + } + + pm_token_t inheritance_operator = { 0 }; + pm_node_t *superclass; + + if (match1(parser, PM_TOKEN_LESS)) { + inheritance_operator = parser->current; + lex_state_set(parser, PM_LEX_STATE_BEG); + + parser->command_start = true; + parser_lex(parser); + + superclass = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CLASS_SUPERCLASS, (uint16_t) (depth + 1)); + } else { + superclass = NULL; + } + + pm_parser_scope_push(parser, true); + + if (inheritance_operator.start != NULL) { + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + pm_node_t *statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_CLASS, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_CLASS, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); + } + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM, &class_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD); + } - pm_regexp_parse(parser, pm_string_source(unescaped), pm_string_length(unescaped), PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED), NULL, NULL, parse_regular_expression_error, &error_data); + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + pm_do_loop_stack_pop(parser); + + if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) { + pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME); + if (!PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); + } + } + + pop_block_exits(parser, previous_block_exits); + return UP(pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, NTOK2PTR(inheritance_operator), superclass, statements, &parser->previous)); +} + +/** + * Parse a method definition expression (the `def` keyword). + */ +static pm_node_t * +parse_def(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, uint16_t depth) { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + pm_token_t def_keyword = parser->current; + size_t opening_newline_index = token_newline_index(parser); + + pm_node_t *receiver = NULL; + pm_token_t operator = { 0 }; + pm_token_t name; + + /* This context is necessary for lexing `...` in a bare params correctly. It + * must be pushed before lexing the first param, so it is here. */ + context_push(parser, PM_CONTEXT_DEF_PARAMS); + parser_lex(parser); + + /* This will be false if the method name is not a valid identifier but could + * be followed by an operator. */ + bool valid_name = true; + + switch (parser->current.type) { + case PM_CASE_OPERATOR: + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_ENDFN); + parser_lex(parser); + + name = parser->previous; + break; + case PM_TOKEN_IDENTIFIER: { + parser_lex(parser); + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + receiver = parse_variable_call(parser); + + pm_parser_scope_push(parser, true); + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + + operator = parser->previous; + name = parse_method_definition_name(parser); + } else { + pm_refute_numbered_parameter(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous)); + pm_parser_scope_push(parser, true); + + name = parser->previous; + } + + break; + } + case PM_TOKEN_INSTANCE_VARIABLE: + case PM_TOKEN_CLASS_VARIABLE: + case PM_TOKEN_GLOBAL_VARIABLE: + valid_name = false; + PRISM_FALLTHROUGH + case PM_TOKEN_CONSTANT: + case PM_TOKEN_KEYWORD_NIL: + case PM_TOKEN_KEYWORD_SELF: + case PM_TOKEN_KEYWORD_TRUE: + case PM_TOKEN_KEYWORD_FALSE: + case PM_TOKEN_KEYWORD___FILE__: + case PM_TOKEN_KEYWORD___LINE__: + case PM_TOKEN_KEYWORD___ENCODING__: { + pm_parser_scope_push(parser, true); + parser_lex(parser); + + pm_token_t identifier = parser->previous; + + if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { + lex_state_set(parser, PM_LEX_STATE_FNAME); + parser_lex(parser); + operator = parser->previous; + + switch (identifier.type) { + case PM_TOKEN_CONSTANT: + receiver = UP(pm_constant_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_INSTANCE_VARIABLE: + receiver = UP(pm_instance_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_CLASS_VARIABLE: + receiver = UP(pm_class_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_GLOBAL_VARIABLE: + receiver = UP(pm_global_variable_read_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_NIL: + receiver = UP(pm_nil_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_SELF: + receiver = UP(pm_self_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_TRUE: + receiver = UP(pm_true_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD_FALSE: + receiver = UP(pm_false_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___FILE__: + receiver = UP(pm_source_file_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___LINE__: + receiver = UP(pm_source_line_node_create(parser, &identifier)); + break; + case PM_TOKEN_KEYWORD___ENCODING__: + receiver = UP(pm_source_encoding_node_create(parser, &identifier)); + break; + default: + break; + } + + name = parse_method_definition_name(parser); + } else { + if (!valid_name) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &identifier, PM_ERR_DEF_NAME, pm_token_str(identifier.type)); + } + + name = identifier; + } + break; + } + case PM_TOKEN_PARENTHESIS_LEFT: { + /* The current context is `PM_CONTEXT_DEF_PARAMS`, however the inner + * expression of this parenthesis should not be processed under this + * context. Thus, the context is popped here. */ + context_pop(parser); + parser_lex(parser); + + pm_token_t lparen = parser->previous; + pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_DEF_RECEIVER, (uint16_t) (depth + 1)); + + accept1(parser, PM_TOKEN_NEWLINE); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + pm_token_t rparen = parser->previous; + + lex_state_set(parser, PM_LEX_STATE_FNAME); + expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM); + + operator = parser->previous; + receiver = UP(pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0)); + + /* To push `PM_CONTEXT_DEF_PARAMS` again is for the same reason as + * described the above. */ + pm_parser_scope_push(parser, true); + context_push(parser, PM_CONTEXT_DEF_PARAMS); + name = parse_method_definition_name(parser); + break; + } + default: + pm_parser_scope_push(parser, true); + name = parse_method_definition_name(parser); + break; + } + + pm_token_t lparen = { 0 }; + pm_token_t rparen = { 0 }; + pm_parameters_node_t *params; + + bool accept_endless_def = true; + switch (parser->current.type) { + case PM_TOKEN_PARENTHESIS_LEFT: { + parser_lex(parser); + lparen = parser->previous; + + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + params = NULL; + } else { + /* https://bugs.ruby-lang.org/issues/19107 */ + bool allow_trailing_comma = parser->version >= PM_OPTIONS_VERSION_CRUBY_4_1; + params = parse_parameters( + parser, + PM_BINDING_POWER_DEFINED, + true, + allow_trailing_comma, + true, + true, + false, + PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, + (uint16_t) (depth + 1) + ); + } + + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + + context_pop(parser); + if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_DEF_PARAMS_TERM_PAREN, pm_token_str(parser->current.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = 0; + } + + rparen = parser->previous; + break; + } + case PM_CASE_PARAMETER: { + /* If we're about to lex a label, we need to add the label state to + * make sure the next newline is ignored. */ + if (parser->current.type == PM_TOKEN_LABEL) { + lex_state_set(parser, parser->lex_state | PM_LEX_STATE_LABEL); + } + + params = parse_parameters( + parser, + PM_BINDING_POWER_DEFINED, + false, + false, + true, + true, + false, + PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES, + (uint16_t) (depth + 1) + ); + + /* Reject `def * = 1` and similar. We have to specifically check for + * them because they create ambiguity with optional arguments. */ + accept_endless_def = false; + + context_pop(parser); + break; + } + default: { + params = NULL; + context_pop(parser); + break; + } + } + + pm_node_t *statements = NULL; + pm_token_t equal = { 0 }; + pm_token_t end_keyword = { 0 }; + + if (accept1(parser, PM_TOKEN_EQUAL)) { + if (token_is_setter_name(&name)) { + pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER); + } + if (!accept_endless_def) { + pm_parser_err_previous(parser, PM_ERR_DEF_ENDLESS_PARAMETERS); + } + if ( + parser->current_context->context == PM_CONTEXT_DEFAULT_PARAMS && + parser->current_context->prev->context == PM_CONTEXT_BLOCK_PARAMETERS + ) { + PM_PARSER_ERR_FORMAT(parser, PM_TOKEN_START(parser, &def_keyword), PM_TOKENS_LENGTH(&def_keyword, &parser->previous), PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE, "endless method definition"); + } + equal = parser->previous; + + context_push(parser, PM_CONTEXT_DEF); + pm_do_loop_stack_push(parser, false); + statements = UP(pm_statements_node_create(parser)); + + uint8_t allow_flags; + if (parser->version >= PM_OPTIONS_VERSION_CRUBY_4_0) { + allow_flags = flags & PM_PARSE_ACCEPTS_COMMAND_CALL; + } else { + /* Allow `def foo = puts "Hello"` but not + * `private def foo = puts "Hello"` */ + allow_flags = (binding_power == PM_BINDING_POWER_ASSIGNMENT || binding_power < PM_BINDING_POWER_COMPOSITION) ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0; + } + + /* Inside a def body, we push true onto the accepts_block_stack so that + * `do` is lexed as PM_TOKEN_KEYWORD_DO (which can only start a block + * for primary-level constructs, not commands). During command argument + * parsing, the stack is pushed to false, causing `do` to be lexed as + * PM_TOKEN_KEYWORD_DO_BLOCK, which is not consumed inside the endless + * def body and instead left for the outer context. */ + pm_accepts_block_stack_push(parser, true); + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, allow_flags | PM_PARSE_IN_ENDLESS_DEF, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1)); + pm_accepts_block_stack_pop(parser); + + /* If an unconsumed PM_TOKEN_KEYWORD_DO follows the body, it is an error + * (e.g., `def f = 1 do end`). PM_TOKEN_KEYWORD_DO_BLOCK is + * intentionally not caught here — it should bubble up to the outer + * context (e.g., `private def f = puts "Hello" do end` where the block + * attaches to `private`). */ + if (accept1(parser, PM_TOKEN_KEYWORD_DO)) { + pm_block_node_t *block = parse_block(parser, (uint16_t) (depth + 1)); + pm_parser_err_node(parser, UP(block), PM_ERR_DEF_ENDLESS_DO_BLOCK); + } + + if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { + context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); + + pm_token_t rescue_keyword = parser->previous; + + /* In the Ruby grammar, the rescue value of an endless method + * command excludes and/or and in/=>. */ + pm_node_t *value = parse_expression(parser, PM_BINDING_POWER_MATCH + 1, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + context_pop(parser); + + statement = UP(pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value)); + } + + /* A nested endless def whose body is a command call (e.g., + * `def f = def g = foo bar`) is a command assignment and cannot appear + * as a def body. */ + if (PM_NODE_TYPE_P(statement, PM_DEF_NODE) && pm_command_call_value_p(statement)) { + PM_PARSER_ERR_NODE_FORMAT(parser, statement, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + + pm_statements_node_body_append(parser, (pm_statements_node_t *) statements, statement, false); + pm_do_loop_stack_pop(parser); + context_pop(parser); + } else { + if (lparen.start == NULL) { + lex_state_set(parser, PM_LEX_STATE_BEG); + parser->command_start = true; + expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_DEF_PARAMS_TERM); + } else { + accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); + } + + pm_accepts_block_stack_push(parser, true); + pm_do_loop_stack_push(parser, false); + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_DEF, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &def_keyword, def_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_DEF, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &def_keyword, false, false); + } + + pm_accepts_block_stack_pop(parser); + pm_do_loop_stack_pop(parser); + + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_DEF_TERM, &def_keyword); + end_keyword = parser->previous; + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + pm_parser_scope_pop(parser); + + /* If the final character is `@` as is the case when defining methods to + * override the unary operators, we should ignore the @ in the same way we + * do for symbols. */ + pm_constant_id_t name_id = pm_parser_constant_id_raw(parser, name.start, parse_operator_symbol_name(&name)); + + flush_block_exits(parser, previous_block_exits); + + return UP(pm_def_node_create( + parser, + name_id, + &name, + receiver, + params, + statements, + &locals, + &def_keyword, + NTOK2PTR(operator), + NTOK2PTR(lparen), + NTOK2PTR(rparen), + NTOK2PTR(equal), + NTOK2PTR(end_keyword) + )); +} + +/** + * Parse a module definition expression (the `module` keyword). + */ +static pm_node_t * +parse_module(pm_parser_t *parser, uint8_t flags, uint16_t depth) { + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + size_t opening_newline_index = token_newline_index(parser); + parser_lex(parser); + pm_token_t module_keyword = parser->previous; + + pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_MODULE_NAME, (uint16_t) (depth + 1)); + pm_token_t name; + + /* If we can recover from a syntax error that occurred while parsing the + * name of the module, then we'll handle that here. */ + if (PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + pop_block_exits(parser, previous_block_exits); + + pm_token_t missing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + return UP(pm_module_node_create(parser, NULL, &module_keyword, constant_path, &missing, NULL, &missing)); + } + + while (accept1(parser, PM_TOKEN_COLON_COLON)) { + pm_token_t double_colon = parser->previous; + + expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); + constant_path = UP(pm_constant_path_node_create(parser, constant_path, &double_colon, &parser->previous)); + } + + /* Here we retrieve the name of the module. If it wasn't a constant, then + * it's possible that `module foo` was passed, which is a syntax error. We + * handle that here as well. */ + name = parser->previous; + if (name.type != PM_TOKEN_CONSTANT) { + pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME); + } + + if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE) && !PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !PM_NODE_TYPE_P(constant_path, PM_ERROR_RECOVERY_NODE)) { + constant_path = UP(pm_error_recovery_node_create_unexpected(parser, constant_path)); + } + + pm_parser_scope_push(parser, true); + accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE); + pm_node_t *statements = NULL; + + if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { + pm_accepts_block_stack_push(parser, true); + statements = UP(parse_statements(parser, PM_CONTEXT_MODULE, (uint16_t) (depth + 1))); + pm_accepts_block_stack_pop(parser); + } + + if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { + assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); + statements = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &module_keyword, module_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_MODULE, (uint16_t) (depth + 1))); + } else { + parser_warn_indentation_mismatch(parser, opening_newline_index, &module_keyword, false, false); + } + + pm_constant_id_list_t locals; + pm_locals_order(parser, &parser->current_scope->locals, &locals, false); + + pm_parser_scope_pop(parser); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM, &module_keyword); + + if (context_def_p(parser)) { + pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD); + } + + pop_block_exits(parser, previous_block_exits); + + return UP(pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous)); +} + +/** + * Parse an interpolated word array literal (`%W[...]`). + */ +static pm_node_t * +parse_string_array(pm_parser_t *parser, uint16_t depth) { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + /* This is the current node that we are parsing that will be added to the + * list of elements. */ + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + /* Reset the explicit encoding if we hit a separator since each + * element can have its own encoding. */ + parser->explicit_encoding = NULL; + + if (current == NULL) { + /* If we hit a separator before we have any content, then we + * don't need to do anything. */ + } else { + /* If we hit a separator after we've hit content, then we + * need to append that content to the list and reset the + * current node. */ + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + pm_node_flag_set(string, parse_unescaped_encoding(parser)); + parser_lex(parser); + + if (current == NULL) { + /* If we hit content and the current node is NULL, then this + * is the first string content we've seen. In that case + * we're going to create a new string node and set that to + * the current. */ + current = string; + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + /* If we hit string content and the current node is an + * interpolated string, then we need to append the string + * content to the list of child nodes. */ + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit string content and the current node is a string + * node, then we need to convert the current node into an + * interpolated string and add the string content to the + * list of child nodes. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + pm_interpolated_string_node_append(parser, interpolated, string); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + if (current == NULL) { + /* If we hit an embedded variable and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit an embedded variable and the current node is a + * string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + current = UP(interpolated); + } else { + /* If we hit an embedded variable and the current node is an + * interpolated string, then we'll just add the embedded + * variable. */ + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + if (current == NULL) { + /* If we hit an embedded expression and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_string_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + /* If we hit an embedded expression and the current node is + * a string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + /* If we hit an embedded expression and the current node is + * an interpolated string, then we'll just continue on. */ + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, part); + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + /* If we have a current node, then we need to append it to the list. */ + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_UPPER_TERM); + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); + } + + pm_array_node_close_set(parser, array, &closing); + return UP(array); +} + +/** + * Parse an interpolated symbol array literal (`%I[...]`). + */ +static pm_node_t * +parse_symbol_array(pm_parser_t *parser, uint16_t depth) { + parser_lex(parser); + pm_token_t opening = parser->previous; + pm_array_node_t *array = pm_array_node_create(parser, &opening); + + /* This is the current node that we are parsing that will be added to the + * list of elements. */ + pm_node_t *current = NULL; + + while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { + switch (parser->current.type) { + case PM_TOKEN_WORDS_SEP: { + if (current == NULL) { + /* If we hit a separator before we have any content, then we + * don't need to do anything. */ + } else { + /* If we hit a separator after we've hit content, then we + * need to append that content to the list and reset the + * current node. */ + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } + + parser_lex(parser); + break; + } + case PM_TOKEN_STRING_CONTENT: { + if (current == NULL) { + /* If we hit content and the current node is NULL, then this + * is the first string content we've seen. In that case + * we're going to create a new string node and set that to + * the current. */ + current = UP(pm_symbol_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + /* If we hit string content and the current node is an + * interpolated string, then we need to append the string + * content to the list of child nodes. */ + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit string content and the current node is a symbol + * node, then we need to convert the current node into an + * interpolated string and add the string content to the + * list of child nodes. */ + pm_symbol_node_t *cast = (pm_symbol_node_t *) current; + pm_token_t content = { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->start + cast->value_loc.start, + .end = parser->start + cast->value_loc.start + cast->value_loc.length + }; + + pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &cast->unescaped)); + pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, NULL, &parser->previous, NULL)); + parser_lex(parser); + + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + pm_interpolated_symbol_node_append(parser->arena, interpolated, first_string); + pm_interpolated_symbol_node_append(parser->arena, interpolated, second_string); + + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + + break; + } + case PM_TOKEN_EMBVAR: { + bool start_location_set = false; + if (current == NULL) { + /* If we hit an embedded variable and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit an embedded variable and the current node is a + * string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(parser->arena, interpolated, current); + PM_NODE_START_SET_NODE(interpolated, current); + start_location_set = true; + current = UP(interpolated); + } else { + /* If we hit an embedded variable and the current node is an + * interpolated string, then we'll just add the embedded + * variable. */ + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + PM_NODE_START_SET_NODE(current, part); + } + break; + } + case PM_TOKEN_EMBEXPR_BEGIN: { + bool start_location_set = false; + if (current == NULL) { + /* If we hit an embedded expression and the current node is + * NULL, then this is the start of a new string. We'll set + * the current node to a new interpolated string. */ + current = UP(pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL)); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + /* If we hit an embedded expression and the current node is + * a string node, then we'll convert the current into an + * interpolated string and add the string node to the list + * of parts. */ + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + + current = UP(pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current)); + pm_interpolated_symbol_node_append(parser->arena, interpolated, current); + PM_NODE_START_SET_NODE(interpolated, current); + start_location_set = true; + current = UP(interpolated); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + /* If we hit an embedded expression and the current node is + * an interpolated string, then we'll just continue on. */ + } else { + assert(false && "unreachable"); + } + + pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, part); + if (!start_location_set) { + PM_NODE_START_SET_NODE(current, part); + } + break; + } + default: + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_UPPER_ELEMENT); + parser_lex(parser); + break; + } + } + + /* If we have a current node, then we need to append it to the list. */ + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + } + + pm_token_t closing = parser->current; + if (match1(parser, PM_TOKEN_EOF)) { + pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_UPPER_TERM); + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + } else { + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_UPPER_TERM); + } + pm_array_node_close_set(parser, array, &closing); + + return UP(array); +} + +/** + * Parse a parenthesized expression, which could be a grouping, a multi-target + * assignment, or a set of statements. + */ +static pm_node_t * +parse_parentheses(pm_parser_t *parser, pm_binding_power_t binding_power, uint16_t depth) { + pm_token_t opening = parser->current; + pm_node_flags_t paren_flags = 0; + + pm_node_list_t current_block_exits = { 0 }; + pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); + + parser_lex(parser); + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + + /* If this is the end of the file or we match a right parenthesis, then we + * have an empty parentheses node, and we can immediately return. */ + if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_EOF)) { + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + pop_block_exits(parser, previous_block_exits); + return UP(pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, paren_flags)); + } + + /* Otherwise, we're going to parse the first statement in the list of + * statements within the parentheses. */ + pm_accepts_block_stack_push(parser, true); + context_push(parser, PM_CONTEXT_PARENS); + pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + context_pop(parser); + + /* Determine if this statement is followed by a terminator. In the case of a + * single statement, this is fine. But in the case of multiple statements + * it's required. */ + bool terminator_found = false; + + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + terminator_found = true; + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (accept1(parser, PM_TOKEN_NEWLINE)) { + terminator_found = true; + } + + if (terminator_found) { + while (true) { + if (accept1(parser, PM_TOKEN_SEMICOLON)) { + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { + break; + } + } + } + + /* If we hit a right parenthesis, then we're done parsing the parentheses + * node, and we can check which kind of node we should return. */ + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + if (opening.type == PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES) { + lex_state_set(parser, PM_LEX_STATE_ENDARG); + } + + parser_lex(parser); + pm_accepts_block_stack_pop(parser); + pop_block_exits(parser, previous_block_exits); + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) || PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + /* If we have a single statement and are ending on a right + * parenthesis, then we need to check if this is possibly a multiple + * target node. */ + pm_multi_target_node_t *multi_target; + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) && ((pm_multi_target_node_t *) statement)->lparen_loc.length == 0) { + multi_target = (pm_multi_target_node_t *) statement; + } else { + multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + } + + multi_target->lparen_loc = TOK2LOC(parser, &opening); + multi_target->rparen_loc = TOK2LOC(parser, &parser->previous); + PM_NODE_START_SET_TOKEN(parser, multi_target, &opening); + PM_NODE_LENGTH_SET_TOKEN(parser, multi_target, &parser->previous); + + pm_node_t *result; + if (match1(parser, PM_TOKEN_COMMA) && (binding_power == PM_BINDING_POWER_STATEMENT)) { + result = parse_targets(parser, UP(multi_target), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + accept1(parser, PM_TOKEN_NEWLINE); + } else { + result = UP(multi_target); + } + + if (context_p(parser, PM_CONTEXT_MULTI_TARGET)) { + /* All set, this is explicitly allowed by the parent context. */ + } else if (context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) { + /* All set, we're inside a for loop and we're parsing multiple + * targets. */ + } else if (binding_power != PM_BINDING_POWER_STATEMENT) { + /* Multi targets are not allowed when it's not a statement + * level. */ + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } else if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { + /* Multi targets must be followed by an equal sign in order to + * be valid (or a right parenthesis if they are nested). */ + pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + + return result; + } + + /* If we have a single statement and are ending on a right parenthesis + * and we didn't return a multiple assignment node, then we can return a + * regular parentheses node now. */ + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + return UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); + } + + /* If we have more than one statement in the set of parentheses, then we are + * going to parse all of them as a list of statements. We'll do that here. + */ + context_push(parser, PM_CONTEXT_PARENS); + paren_flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; + + pm_statements_node_t *statements = pm_statements_node_create(parser); + pm_statements_node_body_append(parser, statements, statement, true); + + /* If we didn't find a terminator and we didn't find a right parenthesis, + * then this is a syntax error. */ + if (!terminator_found && !match1(parser, PM_TOKEN_EOF)) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + + /* Parse each statement within the parentheses. */ + while (true) { + pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); + pm_statements_node_body_append(parser, statements, node, true); + + /* If we're recovering from a syntax error, then we need to stop parsing + * the statements now. */ + if (parser->recovering) { + /* If this is the level of context where the recovery has happened, + * then we can mark the parser as done recovering. */ + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) parser->recovering = false; + break; + } + + /* If we couldn't parse an expression at all, then we need to bail out + * of the loop. */ + if (PM_NODE_TYPE_P(node, PM_ERROR_RECOVERY_NODE)) break; + + /* If we successfully parsed a statement, then we are going to need a + * terminator to delimit them. */ + if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { + while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); + if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) break; + } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { + break; + } else if (!match1(parser, PM_TOKEN_EOF)) { + /* If we're at the end of the file, then we're going to add an error + * after this for the ) anyway. */ + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + } + } + + context_pop(parser); + pm_accepts_block_stack_pop(parser); + expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); + + /* When we're parsing multi targets, we allow them to be followed by a right + * parenthesis if they are at the statement level. This is only possible if + * they are the final statement in a parentheses. We need to explicitly + * reject that here. */ + { + pm_node_t *statement = statements->body.nodes[statements->body.size - 1]; + + if (PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { + pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); + pm_multi_target_node_targets_append(parser, multi_target, statement); + + statement = UP(multi_target); + statements->body.nodes[statements->body.size - 1] = statement; + } + + if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE)) { + const uint8_t *offset = parser->start + PM_NODE_END(statement); + pm_token_t operator = { .type = PM_TOKEN_EQUAL, .start = offset, .end = offset }; + pm_node_t *value = UP(pm_error_recovery_node_create(parser, PM_NODE_END(statement), 0)); + + statement = UP(pm_multi_write_node_create(parser, (pm_multi_target_node_t *) statement, &operator, value)); + statements->body.nodes[statements->body.size - 1] = statement; + + pm_parser_err_node(parser, statement, PM_ERR_WRITE_TARGET_UNEXPECTED); + } + } + + pop_block_exits(parser, previous_block_exits); + pm_void_statements_check(parser, statements, true); + return UP(pm_parentheses_node_create(parser, &opening, UP(statements), &parser->previous, paren_flags)); } /** * Parse an expression that begins with the previous node that we just lexed. */ -static inline pm_node_t * -parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { +static PRISM_INLINE pm_node_t * +parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { switch (parser->current.type) { case PM_TOKEN_BRACKET_LEFT_ARRAY: { parser_lex(parser); @@ -18092,11 +19089,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } else { // If there was no comma, then we need to add a syntax // error. - const uint8_t *location = parser->previous.end; - PM_PARSER_ERR_FORMAT(parser, location, location, PM_ERR_ARRAY_SEPARATOR, pm_token_type_human(parser->current.type)); - - parser->previous.start = location; - parser->previous.type = PM_TOKEN_MISSING; + PM_PARSER_ERR_FORMAT(parser, PM_TOKEN_END(parser, &parser->previous), 0, PM_ERR_ARRAY_SEPARATOR, pm_token_str(parser->current.type)); + parser->previous.start = parser->previous.end; + parser->previous.type = 0; } } @@ -18114,28 +19109,28 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b if (match3(parser, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_COMMA, PM_TOKEN_EOF)) { pm_parser_scope_forwarding_positionals_check(parser, &operator); } else { - expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_ARRAY_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_ARRAY_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); } - element = (pm_node_t *) pm_splat_node_create(parser, &operator, expression); + element = UP(pm_splat_node_create(parser, &operator, expression)); } else if (match2(parser, PM_TOKEN_LABEL, PM_TOKEN_USTAR_STAR)) { if (parsed_bare_hash) { pm_parser_err_current(parser, PM_ERR_EXPRESSION_BARE_HASH); } - element = (pm_node_t *) pm_keyword_hash_node_create(parser); + element = UP(pm_keyword_hash_node_create(parser)); pm_static_literals_t hash_keys = { 0 }; - if (!match8(parser, PM_TOKEN_EOF, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_EOF, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_KEYWORD_DO, PM_TOKEN_PARENTHESIS_RIGHT)) { + if (!match8(parser, PM_TOKEN_EOF, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_TOKEN_KEYWORD_DO_BLOCK, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_BRACKET_RIGHT, PM_TOKEN_KEYWORD_DO, PM_TOKEN_PARENTHESIS_RIGHT)) { parse_assocs(parser, &hash_keys, element, (uint16_t) (depth + 1)); } pm_static_literals_free(&hash_keys); parsed_bare_hash = true; } else { - element = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, true, PM_ERR_ARRAY_EXPRESSION, (uint16_t) (depth + 1)); + element = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_LABEL), PM_ERR_ARRAY_EXPRESSION, (uint16_t) (depth + 1)); - if (pm_symbol_node_label_p(element) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) { + if (pm_symbol_node_label_p(parser, element) || accept1(parser, PM_TOKEN_EQUAL_GREATER)) { if (parsed_bare_hash) { pm_parser_err_previous(parser, PM_ERR_EXPRESSION_BARE_HASH); } @@ -18144,18 +19139,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_static_literals_t hash_keys = { 0 }; pm_hash_key_static_literals_add(parser, &hash_keys, element); - pm_token_t operator; + pm_token_t operator = { 0 }; if (parser->previous.type == PM_TOKEN_EQUAL_GREATER) { operator = parser->previous; - } else { - operator = not_provided(parser); } - pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); - pm_node_t *assoc = (pm_node_t *) pm_assoc_node_create(parser, element, &operator, value); - pm_keyword_hash_node_elements_append(hash, assoc); + pm_node_t *value = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_HASH_VALUE, (uint16_t) (depth + 1)); + pm_node_t *assoc = UP(pm_assoc_node_create(parser, element, NTOK2PTR(operator), value)); + pm_keyword_hash_node_elements_append(parser->arena, hash, assoc); - element = (pm_node_t *) hash; + element = UP(hash); if (accept1(parser, PM_TOKEN_COMMA) && !match1(parser, PM_TOKEN_BRACKET_RIGHT)) { parse_assocs(parser, &hash_keys, element, (uint16_t) (depth + 1)); } @@ -18165,236 +19158,26 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } } - pm_array_node_elements_append(array, element); - if (PM_NODE_TYPE_P(element, PM_MISSING_NODE)) break; + pm_array_node_elements_append(parser->arena, array, element); + if (PM_NODE_TYPE_P(element, PM_ERROR_RECOVERY_NODE)) break; } accept1(parser, PM_TOKEN_NEWLINE); if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_ARRAY_TERM, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_ARRAY_TERM, pm_token_str(parser->current.type)); parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; + parser->previous.type = 0; } - pm_array_node_close_set(array, &parser->previous); + pm_array_node_close_set(parser, array, &parser->previous); pm_accepts_block_stack_pop(parser); - return (pm_node_t *) array; + return UP(array); } case PM_TOKEN_PARENTHESIS_LEFT: - case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: { - pm_token_t opening = parser->current; - pm_node_flags_t flags = 0; - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); - - parser_lex(parser); - while (true) { - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { - break; - } - } - - // If this is the end of the file or we match a right parenthesis, then - // we have an empty parentheses node, and we can immediately return. - if (match2(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_TOKEN_EOF)) { - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_parentheses_node_create(parser, &opening, NULL, &parser->previous, flags); - } - - // Otherwise, we're going to parse the first statement in the list - // of statements within the parentheses. - pm_accepts_block_stack_push(parser, true); - context_push(parser, PM_CONTEXT_PARENS); - pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); - context_pop(parser); - - // Determine if this statement is followed by a terminator. In the - // case of a single statement, this is fine. But in the case of - // multiple statements it's required. - bool terminator_found = false; - - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - terminator_found = true; - flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (accept1(parser, PM_TOKEN_NEWLINE)) { - terminator_found = true; - } - - if (terminator_found) { - while (true) { - if (accept1(parser, PM_TOKEN_SEMICOLON)) { - flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - } else if (!accept1(parser, PM_TOKEN_NEWLINE)) { - break; - } - } - } - - // If we hit a right parenthesis, then we're done parsing the - // parentheses node, and we can check which kind of node we should - // return. - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - if (opening.type == PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES) { - lex_state_set(parser, PM_LEX_STATE_ENDARG); - } - - parser_lex(parser); - pm_accepts_block_stack_pop(parser); - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) || PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { - // If we have a single statement and are ending on a right - // parenthesis, then we need to check if this is possibly a - // multiple target node. - pm_multi_target_node_t *multi_target; - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE) && ((pm_multi_target_node_t *) statement)->lparen_loc.start == NULL) { - multi_target = (pm_multi_target_node_t *) statement; - } else { - multi_target = pm_multi_target_node_create(parser); - pm_multi_target_node_targets_append(parser, multi_target, statement); - } - - pm_location_t lparen_loc = PM_LOCATION_TOKEN_VALUE(&opening); - pm_location_t rparen_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); - - multi_target->lparen_loc = lparen_loc; - multi_target->rparen_loc = rparen_loc; - multi_target->base.location.start = lparen_loc.start; - multi_target->base.location.end = rparen_loc.end; - - pm_node_t *result; - if (match1(parser, PM_TOKEN_COMMA) && (binding_power == PM_BINDING_POWER_STATEMENT)) { - result = parse_targets(parser, (pm_node_t *) multi_target, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); - accept1(parser, PM_TOKEN_NEWLINE); - } else { - result = (pm_node_t *) multi_target; - } - - if (context_p(parser, PM_CONTEXT_MULTI_TARGET)) { - // All set, this is explicitly allowed by the parent - // context. - } else if (context_p(parser, PM_CONTEXT_FOR_INDEX) && match1(parser, PM_TOKEN_KEYWORD_IN)) { - // All set, we're inside a for loop and we're parsing - // multiple targets. - } else if (binding_power != PM_BINDING_POWER_STATEMENT) { - // Multi targets are not allowed when it's not a - // statement level. - pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); - } else if (!match2(parser, PM_TOKEN_EQUAL, PM_TOKEN_PARENTHESIS_RIGHT)) { - // Multi targets must be followed by an equal sign in - // order to be valid (or a right parenthesis if they are - // nested). - pm_parser_err_node(parser, result, PM_ERR_WRITE_TARGET_UNEXPECTED); - } - - return result; - } - - // If we have a single statement and are ending on a right parenthesis - // and we didn't return a multiple assignment node, then we can return a - // regular parentheses node now. - pm_statements_node_t *statements = pm_statements_node_create(parser); - pm_statements_node_body_append(parser, statements, statement, true); - - return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags); - } - - // If we have more than one statement in the set of parentheses, - // then we are going to parse all of them as a list of statements. - // We'll do that here. - context_push(parser, PM_CONTEXT_PARENS); - flags |= PM_PARENTHESES_NODE_FLAGS_MULTIPLE_STATEMENTS; - - pm_statements_node_t *statements = pm_statements_node_create(parser); - pm_statements_node_body_append(parser, statements, statement, true); - - // If we didn't find a terminator and we didn't find a right - // parenthesis, then this is a syntax error. - if (!terminator_found && !match1(parser, PM_TOKEN_EOF)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type)); - } - - // Parse each statement within the parentheses. - while (true) { - pm_node_t *node = parse_expression(parser, PM_BINDING_POWER_STATEMENT, true, false, PM_ERR_CANNOT_PARSE_EXPRESSION, (uint16_t) (depth + 1)); - pm_statements_node_body_append(parser, statements, node, true); - - // If we're recovering from a syntax error, then we need to stop - // parsing the statements now. - if (parser->recovering) { - // If this is the level of context where the recovery has - // happened, then we can mark the parser as done recovering. - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) parser->recovering = false; - break; - } - - // If we couldn't parse an expression at all, then we need to - // bail out of the loop. - if (PM_NODE_TYPE_P(node, PM_MISSING_NODE)) break; - - // If we successfully parsed a statement, then we are going to - // need terminator to delimit them. - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) break; - } else if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - break; - } else if (!match1(parser, PM_TOKEN_EOF)) { - // If we're at the end of the file, then we're going to add - // an error after this for the ) anyway. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(parser->current.type)); - } - } - - context_pop(parser); - pm_accepts_block_stack_pop(parser); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - - // When we're parsing multi targets, we allow them to be followed by - // a right parenthesis if they are at the statement level. This is - // only possible if they are the final statement in a parentheses. - // We need to explicitly reject that here. - { - pm_node_t *statement = statements->body.nodes[statements->body.size - 1]; - - if (PM_NODE_TYPE_P(statement, PM_SPLAT_NODE)) { - pm_multi_target_node_t *multi_target = pm_multi_target_node_create(parser); - pm_multi_target_node_targets_append(parser, multi_target, statement); - - statement = (pm_node_t *) multi_target; - statements->body.nodes[statements->body.size - 1] = statement; - } - - if (PM_NODE_TYPE_P(statement, PM_MULTI_TARGET_NODE)) { - const uint8_t *offset = statement->location.end; - pm_token_t operator = { .type = PM_TOKEN_EQUAL, .start = offset, .end = offset }; - pm_node_t *value = (pm_node_t *) pm_missing_node_create(parser, offset, offset); - - statement = (pm_node_t *) pm_multi_write_node_create(parser, (pm_multi_target_node_t *) statement, &operator, value); - statements->body.nodes[statements->body.size - 1] = statement; - - pm_parser_err_node(parser, statement, PM_ERR_WRITE_TARGET_UNEXPECTED); - } - } - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - pm_void_statements_check(parser, statements, true); - return (pm_node_t *) pm_parentheses_node_create(parser, &opening, (pm_node_t *) statements, &parser->previous, flags); - } + case PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES: + return parse_parentheses(parser, binding_power, depth); case PM_TOKEN_BRACE_LEFT: { // If we were passed a current_hash_keys via the parser, then that // means we're already parsing a hash and we want to share the set @@ -18409,14 +19192,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_accepts_block_stack_push(parser, true); parser_lex(parser); - pm_hash_node_t *node = pm_hash_node_create(parser, &parser->previous); + pm_token_t opening = parser->previous; + pm_hash_node_t *node = pm_hash_node_create(parser, &opening); if (!match2(parser, PM_TOKEN_BRACE_RIGHT, PM_TOKEN_EOF)) { if (current_hash_keys != NULL) { - parse_assocs(parser, current_hash_keys, (pm_node_t *) node, (uint16_t) (depth + 1)); + parse_assocs(parser, current_hash_keys, UP(node), (uint16_t) (depth + 1)); } else { pm_static_literals_t hash_keys = { 0 }; - parse_assocs(parser, &hash_keys, (pm_node_t *) node, (uint16_t) (depth + 1)); + parse_assocs(parser, &hash_keys, UP(node), (uint16_t) (depth + 1)); pm_static_literals_free(&hash_keys); } @@ -18424,26 +19208,33 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } pm_accepts_block_stack_pop(parser); - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_HASH_TERM); - pm_hash_node_closing_loc_set(node, &parser->previous); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_HASH_TERM, &opening); + pm_hash_node_closing_loc_set(parser, node, &parser->previous); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_CHARACTER_LITERAL: { - parser_lex(parser); - - pm_token_t opening = parser->previous; - opening.type = PM_TOKEN_STRING_BEGIN; - opening.end = opening.start + 1; - - pm_token_t content = parser->previous; - content.type = PM_TOKEN_STRING_CONTENT; - content.start = content.start + 1; + pm_node_t *node = UP(pm_string_node_create_current_string( + parser, + &(pm_token_t) { + .type = PM_TOKEN_STRING_BEGIN, + .start = parser->current.start, + .end = parser->current.start + 1 + }, + &(pm_token_t) { + .type = PM_TOKEN_STRING_CONTENT, + .start = parser->current.start + 1, + .end = parser->current.end + }, + NULL + )); - pm_token_t closing = not_provided(parser); - pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &content, &closing); pm_node_flag_set(node, parse_unescaped_encoding(parser)); + // Skip past the character literal here, since now we have handled + // parser->explicit_encoding correctly. + parser_lex(parser); + // Characters can be followed by strings in which case they are // automatically concatenated. if (match1(parser, PM_TOKEN_STRING_BEGIN)) { @@ -18454,7 +19245,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_CLASS_VARIABLE: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_class_variable_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_class_variable_read_node_create(parser, &parser->previous)); if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18470,16 +19261,16 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // fact a method call, not a constant read. if ( match1(parser, PM_TOKEN_PARENTHESIS_LEFT) || - (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || + ((flags & PM_PARSE_ACCEPTS_COMMAND_CALL) && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) || match1(parser, PM_TOKEN_BRACE_LEFT) ) { pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_node_fcall_create(parser, &constant, &arguments); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); + return UP(pm_call_node_fcall_create(parser, &constant, &arguments)); } - pm_node_t *node = (pm_node_t *) pm_constant_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_constant_read_node_create(parser, &parser->previous)); if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) { // If we get here, then we have a comma immediately following a @@ -18494,7 +19285,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t delimiter = parser->previous; expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); - pm_node_t *node = (pm_node_t *) pm_constant_path_node_create(parser, NULL, &delimiter, &parser->previous); + pm_node_t *node = UP(pm_constant_path_node_create(parser, NULL, &delimiter, &parser->previous)); if ((binding_power == PM_BINDING_POWER_STATEMENT) && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18507,7 +19298,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t operator = parser->current; parser_lex(parser); - pm_node_t *right = parse_expression(parser, pm_binding_powers[operator.type].left, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, pm_binding_powers[operator.type].left, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); // Unary .. and ... are special because these are non-associative // operators that can also be unary operators. In this case we need @@ -18517,23 +19308,23 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_parser_err_current(parser, PM_ERR_UNEXPECTED_RANGE_OPERATOR); } - return (pm_node_t *) pm_range_node_create(parser, NULL, &operator, right); + return UP(pm_range_node_create(parser, NULL, &operator, right)); } case PM_TOKEN_FLOAT: parser_lex(parser); - return (pm_node_t *) pm_float_node_create(parser, &parser->previous); + return UP(pm_float_node_create(parser, &parser->previous)); case PM_TOKEN_FLOAT_IMAGINARY: parser_lex(parser); - return (pm_node_t *) pm_float_node_imaginary_create(parser, &parser->previous); + return UP(pm_float_node_imaginary_create(parser, &parser->previous)); case PM_TOKEN_FLOAT_RATIONAL: parser_lex(parser); - return (pm_node_t *) pm_float_node_rational_create(parser, &parser->previous); + return UP(pm_float_node_rational_create(parser, &parser->previous)); case PM_TOKEN_FLOAT_RATIONAL_IMAGINARY: parser_lex(parser); - return (pm_node_t *) pm_float_node_rational_imaginary_create(parser, &parser->previous); + return UP(pm_float_node_rational_imaginary_create(parser, &parser->previous)); case PM_TOKEN_NUMBERED_REFERENCE: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_numbered_reference_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_numbered_reference_read_node_create(parser, &parser->previous)); if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18543,7 +19334,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_GLOBAL_VARIABLE: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_global_variable_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_global_variable_read_node_create(parser, &parser->previous)); if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18553,7 +19344,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_BACK_REFERENCE: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_back_reference_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_back_reference_read_node_create(parser, &parser->previous)); if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18575,26 +19366,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_call_node_t *call = (pm_call_node_t *) node; pm_arguments_t arguments = { 0 }; - if (parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1))) { + if (parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1))) { // Since we found arguments, we need to turn off the // variable call bit in the flags. - pm_node_flag_unset((pm_node_t *)call, PM_CALL_NODE_FLAGS_VARIABLE_CALL); + pm_node_flag_unset(UP(call), PM_CALL_NODE_FLAGS_VARIABLE_CALL); call->opening_loc = arguments.opening_loc; call->arguments = arguments.arguments; call->closing_loc = arguments.closing_loc; call->block = arguments.block; - if (arguments.block != NULL) { - call->base.location.end = arguments.block->location.end; - } else if (arguments.closing_loc.start == NULL) { - if (arguments.arguments != NULL) { - call->base.location.end = arguments.arguments->base.location.end; - } else { - call->base.location.end = call->message_loc.end; - } + const pm_location_t *end = pm_arguments_end(&arguments); + if (end == NULL) { + PM_NODE_LENGTH_SET_LOCATION(call, &call->message_loc); } else { - call->base.location.end = arguments.closing_loc.end; + PM_NODE_LENGTH_SET_LOCATION(call, end); } } } else { @@ -18602,19 +19388,19 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // can still be a method call if it is followed by arguments or // a block, so we need to check for that here. if ( - (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || + ((flags & PM_PARSE_ACCEPTS_COMMAND_CALL) && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) || (pm_accepts_block_stack_p(parser) && match1(parser, PM_TOKEN_KEYWORD_DO)) || match1(parser, PM_TOKEN_BRACE_LEFT) ) { pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); pm_call_node_t *fcall = pm_call_node_fcall_create(parser, &identifier, &arguments); if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { // If we're about to convert an 'it' implicit local // variable read into a method call, we need to remove // it from the list of implicit local variables. - parse_target_implicit_parameter(parser, node); + pm_node_unreference(parser, node); } else { // Otherwise, we're about to convert a regular local // variable read into a method call, in which case we @@ -18622,16 +19408,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // purposes of warnings. assert(PM_NODE_TYPE_P(node, PM_LOCAL_VARIABLE_READ_NODE)); - if (pm_token_is_numbered_parameter(identifier.start, identifier.end)) { - parse_target_implicit_parameter(parser, node); + if (pm_token_is_numbered_parameter(parser, PM_TOKEN_START(parser, &identifier), PM_TOKEN_LENGTH(&identifier))) { + pm_node_unreference(parser, node); } else { pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; pm_locals_unread(&pm_parser_scope_find(parser, cast->depth)->locals, cast->name); } } - pm_node_destroy(parser, node); - return (pm_node_t *) fcall; + return UP(fcall); } } @@ -18663,12 +19448,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t content = parse_strings_empty_content(parser->previous.start); if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { - node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY); + node = UP(pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY)); } else { - node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY); + node = UP(pm_string_node_create_unescaped(parser, &opening, &content, &parser->previous, &PM_STRING_EMPTY)); } - node->location.end = opening.end; + PM_NODE_LENGTH_SET_TOKEN(parser, node, &opening); } else if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) == NULL) { // If we get here, then we tried to find something in the // heredoc but couldn't actually parse anything, so we'll just @@ -18676,7 +19461,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // // parse_string_part handles its own errors, so there is no need // for us to add one here. - node = (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + node = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous))); } else if (PM_NODE_TYPE_P(part, PM_STRING_NODE) && match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { // If we get here, then the part that we parsed was plain string // content and we're at the end of the heredoc, so we can return @@ -18685,8 +19470,8 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_node_flag_set(part, parse_unescaped_encoding(parser)); pm_string_node_t *cast = (pm_string_node_t *) part; - cast->opening_loc = PM_LOCATION_TOKEN_VALUE(&opening); - cast->closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->current); + cast->opening_loc = TOK2LOC(parser, &opening); + cast->closing_loc = TOK2LOC(parser, &parser->current); cast->base.location = cast->opening_loc; if (lex_mode.quote == PM_HEREDOC_QUOTE_BACKTICK) { @@ -18695,21 +19480,21 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } if (lex_mode.indent == PM_HEREDOC_INDENT_TILDE && (common_whitespace != (size_t) -1) && (common_whitespace != 0)) { - parse_heredoc_dedent_string(&cast->unescaped, common_whitespace); + parse_heredoc_dedent_string(parser->arena, &cast->unescaped, common_whitespace); } - node = (pm_node_t *) cast; + node = UP(cast); expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); } else { // If we get here, then we have multiple parts in the heredoc, // so we'll need to create an interpolated string node to hold // them all. pm_node_list_t parts = { 0 }; - pm_node_list_append(&parts, part); + pm_node_list_append(parser->arena, &parts, part); while (!match2(parser, PM_TOKEN_HEREDOC_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_node_list_append(&parts, part); + pm_node_list_append(parser->arena, &parts, part); } } @@ -18720,19 +19505,18 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b cast->parts = parts; expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); - pm_interpolated_xstring_node_closing_set(cast, &parser->previous); + pm_interpolated_xstring_node_closing_set(parser, cast, &parser->previous); cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + node = UP(cast); } else { pm_interpolated_string_node_t *cast = pm_interpolated_string_node_create(parser, &opening, &parts, &opening); - pm_node_list_free(&parts); expect1_heredoc_term(parser, lex_mode.ident_start, lex_mode.ident_length); - pm_interpolated_string_node_closing_set(cast, &parser->previous); + pm_interpolated_string_node_closing_set(parser, cast, &parser->previous); cast->base.location = cast->opening_loc; - node = (pm_node_t *) cast; + node = UP(cast); } // If this is a heredoc that is indented with a ~, then we need @@ -18757,7 +19541,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } case PM_TOKEN_INSTANCE_VARIABLE: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_instance_variable_read_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_instance_variable_read_node_create(parser, &parser->previous)); if (binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { node = parse_targets_validate(parser, node, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -18766,34 +19550,34 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b return node; } case PM_TOKEN_INTEGER: { - pm_node_flags_t base = parser->integer_base; + pm_node_flags_t base = parser->integer.base; parser_lex(parser); - return (pm_node_t *) pm_integer_node_create(parser, base, &parser->previous); + return UP(pm_integer_node_create(parser, base, &parser->previous)); } case PM_TOKEN_INTEGER_IMAGINARY: { - pm_node_flags_t base = parser->integer_base; + pm_node_flags_t base = parser->integer.base; parser_lex(parser); - return (pm_node_t *) pm_integer_node_imaginary_create(parser, base, &parser->previous); + return UP(pm_integer_node_imaginary_create(parser, base, &parser->previous)); } case PM_TOKEN_INTEGER_RATIONAL: { - pm_node_flags_t base = parser->integer_base; + pm_node_flags_t base = parser->integer.base; parser_lex(parser); - return (pm_node_t *) pm_integer_node_rational_create(parser, base, &parser->previous); + return UP(pm_integer_node_rational_create(parser, base, &parser->previous)); } case PM_TOKEN_INTEGER_RATIONAL_IMAGINARY: { - pm_node_flags_t base = parser->integer_base; + pm_node_flags_t base = parser->integer.base; parser_lex(parser); - return (pm_node_t *) pm_integer_node_rational_imaginary_create(parser, base, &parser->previous); + return UP(pm_integer_node_rational_imaginary_create(parser, base, &parser->previous)); } case PM_TOKEN_KEYWORD___ENCODING__: parser_lex(parser); - return (pm_node_t *) pm_source_encoding_node_create(parser, &parser->previous); + return UP(pm_source_encoding_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD___FILE__: parser_lex(parser); - return (pm_node_t *) pm_source_file_node_create(parser, &parser->previous); + return UP(pm_source_file_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD___LINE__: parser_lex(parser); - return (pm_node_t *) pm_source_line_node_create(parser, &parser->previous); + return UP(pm_source_line_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD_ALIAS: { if (binding_power != PM_BINDING_POWER_STATEMENT) { pm_parser_err_current(parser, PM_ERR_STATEMENT_ALIAS); @@ -18813,245 +19597,27 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b if (PM_NODE_TYPE_P(old_name, PM_NUMBERED_REFERENCE_READ_NODE)) { pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT_NUMBERED_REFERENCE); } - } else { + } else if (!PM_NODE_TYPE_P(old_name, PM_ERROR_RECOVERY_NODE)) { pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT); + old_name = UP(pm_error_recovery_node_create_unexpected(parser, old_name)); } - return (pm_node_t *) pm_alias_global_variable_node_create(parser, &keyword, new_name, old_name); + return UP(pm_alias_global_variable_node_create(parser, &keyword, new_name, old_name)); } case PM_SYMBOL_NODE: case PM_INTERPOLATED_SYMBOL_NODE: { - if (!PM_NODE_TYPE_P(old_name, PM_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_INTERPOLATED_SYMBOL_NODE)) { + if (!PM_NODE_TYPE_P(old_name, PM_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_INTERPOLATED_SYMBOL_NODE) && !PM_NODE_TYPE_P(old_name, PM_ERROR_RECOVERY_NODE)) { pm_parser_err_node(parser, old_name, PM_ERR_ALIAS_ARGUMENT); + old_name = UP(pm_error_recovery_node_create_unexpected(parser, old_name)); } } PRISM_FALLTHROUGH default: - return (pm_node_t *) pm_alias_method_node_create(parser, &keyword, new_name, old_name); - } - } - case PM_TOKEN_KEYWORD_CASE: { - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - - pm_token_t case_keyword = parser->previous; - pm_node_t *predicate = NULL; - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); - - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - predicate = NULL; - } else if (match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_END)) { - predicate = NULL; - } else if (!token_begins_expression_p(parser->current.type)) { - predicate = NULL; - } else { - predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CASE_EXPRESSION_AFTER_CASE, (uint16_t) (depth + 1)); - while (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)); - } - - if (match1(parser, PM_TOKEN_KEYWORD_END)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); - parser_lex(parser); - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - return (pm_node_t *) pm_case_node_create(parser, &case_keyword, predicate, &parser->previous); - } - - // At this point we can create a case node, though we don't yet know - // if it is a case-in or case-when node. - pm_token_t end_keyword = not_provided(parser); - pm_node_t *node; - - if (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { - pm_case_node_t *case_node = pm_case_node_create(parser, &case_keyword, predicate, &end_keyword); - pm_static_literals_t literals = { 0 }; - - // At this point we've seen a when keyword, so we know this is a - // case-when node. We will continue to parse the when nodes - // until we hit the end of the list. - while (match1(parser, PM_TOKEN_KEYWORD_WHEN)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); - parser_lex(parser); - - pm_token_t when_keyword = parser->previous; - pm_when_node_t *when_node = pm_when_node_create(parser, &when_keyword); - - do { - if (accept1(parser, PM_TOKEN_USTAR)) { - pm_token_t operator = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); - - pm_splat_node_t *splat_node = pm_splat_node_create(parser, &operator, expression); - pm_when_node_conditions_append(when_node, (pm_node_t *) splat_node); - - if (PM_NODE_TYPE_P(expression, PM_MISSING_NODE)) break; - } else { - pm_node_t *condition = parse_value_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_CASE_EXPRESSION_AFTER_WHEN, (uint16_t) (depth + 1)); - pm_when_node_conditions_append(when_node, condition); - - // If we found a missing node, then this is a syntax - // error and we should stop looping. - if (PM_NODE_TYPE_P(condition, PM_MISSING_NODE)) break; - - // If this is a string node, then we need to mark it - // as frozen because when clause strings are frozen. - if (PM_NODE_TYPE_P(condition, PM_STRING_NODE)) { - pm_node_flag_set(condition, PM_STRING_FLAGS_FROZEN | PM_NODE_FLAG_STATIC_LITERAL); - } else if (PM_NODE_TYPE_P(condition, PM_SOURCE_FILE_NODE)) { - pm_node_flag_set(condition, PM_NODE_FLAG_STATIC_LITERAL); - } - - pm_when_clause_static_literals_add(parser, &literals, condition); - } - } while (accept1(parser, PM_TOKEN_COMMA)); - - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { - pm_when_node_then_keyword_loc_set(when_node, &parser->previous); - } - } else { - expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_WHEN_DELIMITER); - pm_when_node_then_keyword_loc_set(when_node, &parser->previous); - } - - if (!match3(parser, PM_TOKEN_KEYWORD_WHEN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_CASE_WHEN, (uint16_t) (depth + 1)); - if (statements != NULL) { - pm_when_node_statements_set(when_node, statements); - } - } - - pm_case_node_condition_append(case_node, (pm_node_t *) when_node); - } - - // If we didn't parse any conditions (in or when) then we need - // to indicate that we have an error. - if (case_node->conditions.size == 0) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - } - - pm_static_literals_free(&literals); - node = (pm_node_t *) case_node; - } else { - pm_case_match_node_t *case_node = pm_case_match_node_create(parser, &case_keyword, predicate, &end_keyword); - - // If this is a case-match node (i.e., it is a pattern matching - // case statement) then we must have a predicate. - if (predicate == NULL) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MATCH_MISSING_PREDICATE); - } - - // At this point we expect that we're parsing a case-in node. We - // will continue to parse the in nodes until we hit the end of - // the list. - while (match1(parser, PM_TOKEN_KEYWORD_IN)) { - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, true); - - bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; - parser->pattern_matching_newlines = true; - - lex_state_set(parser, PM_LEX_STATE_BEG | PM_LEX_STATE_LABEL); - parser->command_start = false; - parser_lex(parser); - - pm_token_t in_keyword = parser->previous; - - pm_constant_id_list_t captures = { 0 }; - pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); - - parser->pattern_matching_newlines = previous_pattern_matching_newlines; - pm_constant_id_list_free(&captures); - - // Since we're in the top-level of the case-in node we need - // to check for guard clauses in the form of `if` or - // `unless` statements. - if (accept1(parser, PM_TOKEN_KEYWORD_IF_MODIFIER)) { - pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); - pattern = (pm_node_t *) pm_if_node_modifier_create(parser, pattern, &keyword, predicate); - } else if (accept1(parser, PM_TOKEN_KEYWORD_UNLESS_MODIFIER)) { - pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); - pattern = (pm_node_t *) pm_unless_node_modifier_create(parser, pattern, &keyword, predicate); - } - - // Now we need to check for the terminator of the in node's - // pattern. It can be a newline or semicolon optionally - // followed by a `then` keyword. - pm_token_t then_keyword; - if (accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - if (accept1(parser, PM_TOKEN_KEYWORD_THEN)) { - then_keyword = parser->previous; - } else { - then_keyword = not_provided(parser); - } - } else { - expect1(parser, PM_TOKEN_KEYWORD_THEN, PM_ERR_EXPECT_IN_DELIMITER); - then_keyword = parser->previous; - } - - // Now we can actually parse the statements associated with - // the in node. - pm_statements_node_t *statements; - if (match3(parser, PM_TOKEN_KEYWORD_IN, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - statements = NULL; - } else { - statements = parse_statements(parser, PM_CONTEXT_CASE_IN, (uint16_t) (depth + 1)); - } - - // Now that we have the full pattern and statements, we can - // create the node and attach it to the case node. - pm_node_t *condition = (pm_node_t *) pm_in_node_create(parser, pattern, statements, &in_keyword, &then_keyword); - pm_case_match_node_condition_append(case_node, condition); - } - - // If we didn't parse any conditions (in or when) then we need - // to indicate that we have an error. - if (case_node->conditions.size == 0) { - pm_parser_err_token(parser, &case_keyword, PM_ERR_CASE_MISSING_CONDITIONS); - } - - node = (pm_node_t *) case_node; - } - - accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - if (accept1(parser, PM_TOKEN_KEYWORD_ELSE)) { - pm_token_t else_keyword = parser->previous; - pm_else_node_t *else_node; - - if (!match1(parser, PM_TOKEN_KEYWORD_END)) { - else_node = pm_else_node_create(parser, &else_keyword, parse_statements(parser, PM_CONTEXT_ELSE, (uint16_t) (depth + 1)), &parser->current); - } else { - else_node = pm_else_node_create(parser, &else_keyword, NULL, &parser->current); - } - - if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { - pm_case_node_else_clause_set((pm_case_node_t *) node, else_node); - } else { - pm_case_match_node_else_clause_set((pm_case_match_node_t *) node, else_node); - } - } - - parser_warn_indentation_mismatch(parser, opening_newline_index, &case_keyword, false, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CASE_TERM); - - if (PM_NODE_TYPE_P(node, PM_CASE_NODE)) { - pm_case_node_end_keyword_loc_set((pm_case_node_t *) node, &parser->previous); - } else { - pm_case_match_node_end_keyword_loc_set((pm_case_match_node_t *) node, &parser->previous); + return UP(pm_alias_method_node_create(parser, &keyword, new_name, old_name)); } - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return node; } + case PM_TOKEN_KEYWORD_CASE: + return parse_case(parser, flags, depth); case PM_TOKEN_KEYWORD_BEGIN: { size_t opening_newline_index = token_newline_index(parser); parser_lex(parser); @@ -19072,15 +19638,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_begin_node_t *begin_node = pm_begin_node_create(parser, &begin_keyword, begin_statements); parse_rescues(parser, opening_newline_index, &begin_keyword, begin_node, PM_RESCUES_BEGIN, (uint16_t) (depth + 1)); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_BEGIN_TERM); - - begin_node->base.location.end = parser->previous.end; - pm_begin_node_end_keyword_set(begin_node, &parser->previous); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_BEGIN_TERM, &begin_keyword); + PM_NODE_LENGTH_SET_TOKEN(parser, begin_node, &parser->previous); + pm_begin_node_end_keyword_set(parser, begin_node, &parser->previous); pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) begin_node; + return UP(begin_node); } case PM_TOKEN_KEYWORD_BEGIN_UPCASE: { pm_node_list_t current_block_exits = { 0 }; @@ -19097,16 +19660,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = parser->previous; pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_PREEXE, (uint16_t) (depth + 1)); - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BEGIN_UPCASE_TERM); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_BEGIN_UPCASE_TERM, &opening); pm_context_t context = parser->current_context->context; if ((context != PM_CONTEXT_MAIN) && (context != PM_CONTEXT_PREEXE)) { pm_parser_err_token(parser, &keyword, PM_ERR_BEGIN_UPCASE_TOPLEVEL); } flush_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_pre_execution_node_create(parser, &keyword, &opening, statements, &parser->previous); + return UP(pm_pre_execution_node_create(parser, &keyword, &opening, statements, &parser->previous)); } case PM_TOKEN_KEYWORD_BREAK: case PM_TOKEN_KEYWORD_NEXT: @@ -19123,29 +19684,44 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_binding_power_t binding_power = pm_binding_powers[parser->current.type].left; if (binding_power == PM_BINDING_POWER_UNSET || binding_power >= PM_BINDING_POWER_RANGE) { - parse_arguments(parser, &arguments, false, PM_TOKEN_EOF, (uint16_t) (depth + 1)); + pm_token_t next = parser->current; + parse_arguments(parser, &arguments, false, PM_TOKEN_EOF, flags, (uint16_t) (depth + 1)); + + // Reject `foo && return bar`. + if (!(flags & PM_PARSE_ACCEPTS_COMMAND_CALL) && arguments.arguments != NULL) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &next, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(next.type)); + } + } + + // It's possible that we've parsed a block argument through our + // call to parse_arguments. If we found one, we should mark it + // as invalid and destroy it, as we don't have a place for it. + if (arguments.block != NULL) { + pm_parser_err_node(parser, arguments.block, PM_ERR_UNEXPECTED_BLOCK_ARGUMENT); + pm_node_unreference(parser, arguments.block); + arguments.block = NULL; } } switch (keyword.type) { case PM_TOKEN_KEYWORD_BREAK: { - pm_node_t *node = (pm_node_t *) pm_break_node_create(parser, &keyword, arguments.arguments); + pm_node_t *node = UP(pm_break_node_create(parser, &keyword, arguments.arguments)); if (!parser->partial_script) parse_block_exit(parser, node); return node; } case PM_TOKEN_KEYWORD_NEXT: { - pm_node_t *node = (pm_node_t *) pm_next_node_create(parser, &keyword, arguments.arguments); + pm_node_t *node = UP(pm_next_node_create(parser, &keyword, arguments.arguments)); if (!parser->partial_script) parse_block_exit(parser, node); return node; } case PM_TOKEN_KEYWORD_RETURN: { - pm_node_t *node = (pm_node_t *) pm_return_node_create(parser, &keyword, arguments.arguments); + pm_node_t *node = UP(pm_return_node_create(parser, &keyword, arguments.arguments)); parse_return(parser, node); return node; } default: assert(false && "unreachable"); - return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous))); } } case PM_TOKEN_KEYWORD_SUPER: { @@ -19153,24 +19729,24 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t keyword = parser->previous; pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); if ( - arguments.opening_loc.start == NULL && + arguments.opening_loc.length == 0 && arguments.arguments == NULL && ((arguments.block == NULL) || PM_NODE_TYPE_P(arguments.block, PM_BLOCK_NODE)) ) { - return (pm_node_t *) pm_forwarding_super_node_create(parser, &keyword, &arguments); + return UP(pm_forwarding_super_node_create(parser, &keyword, &arguments)); } - return (pm_node_t *) pm_super_node_create(parser, &keyword, &arguments); + return UP(pm_super_node_create(parser, &keyword, &arguments)); } case PM_TOKEN_KEYWORD_YIELD: { parser_lex(parser); pm_token_t keyword = parser->previous; pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, false, accepts_command_call, (uint16_t) (depth + 1)); + parse_arguments_list(parser, &arguments, false, flags, (uint16_t) (depth + 1)); // It's possible that we've parsed a block argument through our // call to parse_arguments_list. If we found one, we should mark it @@ -19178,434 +19754,25 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // yield node. if (arguments.block != NULL) { pm_parser_err_node(parser, arguments.block, PM_ERR_UNEXPECTED_BLOCK_ARGUMENT); - pm_node_destroy(parser, arguments.block); + pm_node_unreference(parser, arguments.block); arguments.block = NULL; } - pm_node_t *node = (pm_node_t *) pm_yield_node_create(parser, &keyword, &arguments.opening_loc, arguments.arguments, &arguments.closing_loc); + pm_node_t *node = UP(pm_yield_node_create(parser, &keyword, &arguments.opening_loc, arguments.arguments, &arguments.closing_loc)); if (!parser->parsing_eval && !parser->partial_script) parse_yield(parser, node); return node; } - case PM_TOKEN_KEYWORD_CLASS: { - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - - pm_token_t class_keyword = parser->previous; - pm_do_loop_stack_push(parser, false); - - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); - - if (accept1(parser, PM_TOKEN_LESS_LESS)) { - pm_token_t operator = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_EXPECT_EXPRESSION_AFTER_LESS_LESS, (uint16_t) (depth + 1)); - - pm_parser_scope_push(parser, true); - if (!match2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_SINGLETON_CLASS_DELIMITER, pm_token_type_human(parser->current.type)); - } - - pm_node_t *statements = NULL; - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_SCLASS, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); - } - - if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = (pm_node_t *) parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_SCLASS, (uint16_t) (depth + 1)); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); - } - - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM); - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - pm_do_loop_stack_pop(parser); - - flush_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_singleton_class_node_create(parser, &locals, &class_keyword, &operator, expression, statements, &parser->previous); - } - - pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_CLASS_NAME, (uint16_t) (depth + 1)); - pm_token_t name = parser->previous; - if (name.type != PM_TOKEN_CONSTANT) { - pm_parser_err_token(parser, &name, PM_ERR_CLASS_NAME); - } - - pm_token_t inheritance_operator; - pm_node_t *superclass; - - if (match1(parser, PM_TOKEN_LESS)) { - inheritance_operator = parser->current; - lex_state_set(parser, PM_LEX_STATE_BEG); - - parser->command_start = true; - parser_lex(parser); - - superclass = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CLASS_SUPERCLASS, (uint16_t) (depth + 1)); - } else { - inheritance_operator = not_provided(parser); - superclass = NULL; - } - - pm_parser_scope_push(parser, true); - - if (inheritance_operator.type != PM_TOKEN_NOT_PROVIDED) { - expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CLASS_UNEXPECTED_END); - } else { - accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - } - pm_node_t *statements = NULL; - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_CLASS, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); - } - - if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = (pm_node_t *) parse_rescues_implicit_begin(parser, opening_newline_index, &class_keyword, class_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_CLASS, (uint16_t) (depth + 1)); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &class_keyword, false, false); - } - - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_CLASS_TERM); - - if (context_def_p(parser)) { - pm_parser_err_token(parser, &class_keyword, PM_ERR_CLASS_IN_METHOD); - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - pm_do_loop_stack_pop(parser); - - if (!PM_NODE_TYPE_P(constant_path, PM_CONSTANT_PATH_NODE) && !(PM_NODE_TYPE_P(constant_path, PM_CONSTANT_READ_NODE))) { - pm_parser_err_node(parser, constant_path, PM_ERR_CLASS_NAME); - } - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_class_node_create(parser, &locals, &class_keyword, constant_path, &name, &inheritance_operator, superclass, statements, &parser->previous); - } - case PM_TOKEN_KEYWORD_DEF: { - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); - - pm_token_t def_keyword = parser->current; - size_t opening_newline_index = token_newline_index(parser); - - pm_node_t *receiver = NULL; - pm_token_t operator = not_provided(parser); - pm_token_t name; - - // This context is necessary for lexing `...` in a bare params - // correctly. It must be pushed before lexing the first param, so it - // is here. - context_push(parser, PM_CONTEXT_DEF_PARAMS); - parser_lex(parser); - - // This will be false if the method name is not a valid identifier - // but could be followed by an operator. - bool valid_name = true; - - switch (parser->current.type) { - case PM_CASE_OPERATOR: - pm_parser_scope_push(parser, true); - lex_state_set(parser, PM_LEX_STATE_ENDFN); - parser_lex(parser); - - name = parser->previous; - break; - case PM_TOKEN_IDENTIFIER: { - parser_lex(parser); - - if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { - receiver = parse_variable_call(parser); - - pm_parser_scope_push(parser, true); - lex_state_set(parser, PM_LEX_STATE_FNAME); - parser_lex(parser); - - operator = parser->previous; - name = parse_method_definition_name(parser); - } else { - pm_refute_numbered_parameter(parser, parser->previous.start, parser->previous.end); - pm_parser_scope_push(parser, true); - - name = parser->previous; - } - - break; - } - case PM_TOKEN_INSTANCE_VARIABLE: - case PM_TOKEN_CLASS_VARIABLE: - case PM_TOKEN_GLOBAL_VARIABLE: - valid_name = false; - PRISM_FALLTHROUGH - case PM_TOKEN_CONSTANT: - case PM_TOKEN_KEYWORD_NIL: - case PM_TOKEN_KEYWORD_SELF: - case PM_TOKEN_KEYWORD_TRUE: - case PM_TOKEN_KEYWORD_FALSE: - case PM_TOKEN_KEYWORD___FILE__: - case PM_TOKEN_KEYWORD___LINE__: - case PM_TOKEN_KEYWORD___ENCODING__: { - pm_parser_scope_push(parser, true); - parser_lex(parser); - - pm_token_t identifier = parser->previous; - - if (match2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON)) { - lex_state_set(parser, PM_LEX_STATE_FNAME); - parser_lex(parser); - operator = parser->previous; - - switch (identifier.type) { - case PM_TOKEN_CONSTANT: - receiver = (pm_node_t *) pm_constant_read_node_create(parser, &identifier); - break; - case PM_TOKEN_INSTANCE_VARIABLE: - receiver = (pm_node_t *) pm_instance_variable_read_node_create(parser, &identifier); - break; - case PM_TOKEN_CLASS_VARIABLE: - receiver = (pm_node_t *) pm_class_variable_read_node_create(parser, &identifier); - break; - case PM_TOKEN_GLOBAL_VARIABLE: - receiver = (pm_node_t *) pm_global_variable_read_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD_NIL: - receiver = (pm_node_t *) pm_nil_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD_SELF: - receiver = (pm_node_t *) pm_self_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD_TRUE: - receiver = (pm_node_t *) pm_true_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD_FALSE: - receiver = (pm_node_t *) pm_false_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD___FILE__: - receiver = (pm_node_t *) pm_source_file_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD___LINE__: - receiver = (pm_node_t *) pm_source_line_node_create(parser, &identifier); - break; - case PM_TOKEN_KEYWORD___ENCODING__: - receiver = (pm_node_t *) pm_source_encoding_node_create(parser, &identifier); - break; - default: - break; - } - - name = parse_method_definition_name(parser); - } else { - if (!valid_name) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, identifier, PM_ERR_DEF_NAME, pm_token_type_human(identifier.type)); - } - - name = identifier; - } - break; - } - case PM_TOKEN_PARENTHESIS_LEFT: { - // The current context is `PM_CONTEXT_DEF_PARAMS`, however - // the inner expression of this parenthesis should not be - // processed under this context. Thus, the context is popped - // here. - context_pop(parser); - parser_lex(parser); - - pm_token_t lparen = parser->previous; - pm_node_t *expression = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEF_RECEIVER, (uint16_t) (depth + 1)); - - accept1(parser, PM_TOKEN_NEWLINE); - expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - pm_token_t rparen = parser->previous; - - lex_state_set(parser, PM_LEX_STATE_FNAME); - expect2(parser, PM_TOKEN_DOT, PM_TOKEN_COLON_COLON, PM_ERR_DEF_RECEIVER_TERM); - - operator = parser->previous; - receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, expression, &rparen, 0); - - // To push `PM_CONTEXT_DEF_PARAMS` again is for the same - // reason as described the above. - pm_parser_scope_push(parser, true); - context_push(parser, PM_CONTEXT_DEF_PARAMS); - name = parse_method_definition_name(parser); - break; - } - default: - pm_parser_scope_push(parser, true); - name = parse_method_definition_name(parser); - break; - } - - pm_token_t lparen; - pm_token_t rparen; - pm_parameters_node_t *params; - - switch (parser->current.type) { - case PM_TOKEN_PARENTHESIS_LEFT: { - parser_lex(parser); - lparen = parser->previous; - - if (match1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - params = NULL; - } else { - params = parse_parameters(parser, PM_BINDING_POWER_DEFINED, true, false, true, true, false, (uint16_t) (depth + 1)); - } - - lex_state_set(parser, PM_LEX_STATE_BEG); - parser->command_start = true; - - context_pop(parser); - if (!accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_DEF_PARAMS_TERM_PAREN, pm_token_type_human(parser->current.type)); - parser->previous.start = parser->previous.end; - parser->previous.type = PM_TOKEN_MISSING; - } - - rparen = parser->previous; - break; - } - case PM_CASE_PARAMETER: { - // If we're about to lex a label, we need to add the label - // state to make sure the next newline is ignored. - if (parser->current.type == PM_TOKEN_LABEL) { - lex_state_set(parser, parser->lex_state | PM_LEX_STATE_LABEL); - } - - lparen = not_provided(parser); - rparen = not_provided(parser); - params = parse_parameters(parser, PM_BINDING_POWER_DEFINED, false, false, true, true, false, (uint16_t) (depth + 1)); - - context_pop(parser); - break; - } - default: { - lparen = not_provided(parser); - rparen = not_provided(parser); - params = NULL; - - context_pop(parser); - break; - } - } - - pm_node_t *statements = NULL; - pm_token_t equal; - pm_token_t end_keyword; - - if (accept1(parser, PM_TOKEN_EQUAL)) { - if (token_is_setter_name(&name)) { - pm_parser_err_token(parser, &name, PM_ERR_DEF_ENDLESS_SETTER); - } - equal = parser->previous; - - context_push(parser, PM_CONTEXT_DEF); - pm_do_loop_stack_push(parser, false); - statements = (pm_node_t *) pm_statements_node_create(parser); - - pm_node_t *statement = parse_expression(parser, PM_BINDING_POWER_DEFINED + 1, binding_power < PM_BINDING_POWER_COMPOSITION, false, PM_ERR_DEF_ENDLESS, (uint16_t) (depth + 1)); - - if (accept1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { - context_push(parser, PM_CONTEXT_RESCUE_MODIFIER); - - pm_token_t rescue_keyword = parser->previous; - pm_node_t *value = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); - context_pop(parser); - - statement = (pm_node_t *) pm_rescue_modifier_node_create(parser, statement, &rescue_keyword, value); - } - - pm_statements_node_body_append(parser, (pm_statements_node_t *) statements, statement, false); - pm_do_loop_stack_pop(parser); - context_pop(parser); - end_keyword = not_provided(parser); - } else { - equal = not_provided(parser); - - if (lparen.type == PM_TOKEN_NOT_PROVIDED) { - lex_state_set(parser, PM_LEX_STATE_BEG); - parser->command_start = true; - expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_DEF_PARAMS_TERM); - } else { - accept2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON); - } - - pm_accepts_block_stack_push(parser, true); - pm_do_loop_stack_push(parser, false); - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_DEF, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); - } - - if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = (pm_node_t *) parse_rescues_implicit_begin(parser, opening_newline_index, &def_keyword, def_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_DEF, (uint16_t) (depth + 1)); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &def_keyword, false, false); - } - - pm_accepts_block_stack_pop(parser); - pm_do_loop_stack_pop(parser); - - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_DEF_TERM); - end_keyword = parser->previous; - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - pm_parser_scope_pop(parser); - - /** - * If the final character is @. As is the case when defining - * methods to override the unary operators, we should ignore - * the @ in the same way we do for symbols. - */ - pm_constant_id_t name_id = pm_parser_constant_id_location(parser, name.start, parse_operator_symbol_name(&name)); - - flush_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_def_node_create( - parser, - name_id, - &name, - receiver, - params, - statements, - &locals, - &def_keyword, - &operator, - &lparen, - &rparen, - &equal, - &end_keyword - ); - } + case PM_TOKEN_KEYWORD_CLASS: + return parse_class(parser, flags, depth); + case PM_TOKEN_KEYWORD_DEF: + return parse_def(parser, binding_power, flags, depth); case PM_TOKEN_KEYWORD_DEFINED: { parser_lex(parser); - pm_token_t keyword = parser->previous; - pm_token_t lparen; - pm_token_t rparen; + pm_token_t keyword = parser->previous; + pm_token_t lparen = { 0 }; + pm_token_t rparen = { 0 }; pm_node_t *expression; context_push(parser, PM_CONTEXT_DEFINED); @@ -19615,34 +19782,29 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b lparen = parser->previous; if (newline && accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - expression = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0); - lparen = not_provided(parser); - rparen = not_provided(parser); + expression = UP(pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0)); + lparen = (pm_token_t) { 0 }; } else { - expression = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); + expression = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); - if (parser->recovering) { - rparen = not_provided(parser); - } else { + if (!parser->recovering) { accept1(parser, PM_TOKEN_NEWLINE); expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); rparen = parser->previous; } } } else { - lparen = not_provided(parser); - rparen = not_provided(parser); - expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); + expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_DEFINED_EXPRESSION, (uint16_t) (depth + 1)); } context_pop(parser); - return (pm_node_t *) pm_defined_node_create( + return UP(pm_defined_node_create( parser, - &lparen, + NTOK2PTR(lparen), expression, - &rparen, - &PM_LOCATION_TOKEN_VALUE(&keyword) - ); + NTOK2PTR(rparen), + &keyword + )); } case PM_TOKEN_KEYWORD_END_UPCASE: { if (binding_power != PM_BINDING_POWER_STATEMENT) { @@ -19660,12 +19822,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = parser->previous; pm_statements_node_t *statements = parse_statements(parser, PM_CONTEXT_POSTEXE, (uint16_t) (depth + 1)); - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_END_UPCASE_TERM); - return (pm_node_t *) pm_post_execution_node_create(parser, &keyword, &opening, statements, &parser->previous); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_END_UPCASE_TERM, &opening); + return UP(pm_post_execution_node_create(parser, &keyword, &opening, statements, &parser->previous)); } case PM_TOKEN_KEYWORD_FALSE: parser_lex(parser); - return (pm_node_t *) pm_false_node_create(parser, &parser->previous); + return UP(pm_false_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD_FOR: { size_t opening_newline_index = token_newline_index(parser); parser_lex(parser); @@ -19681,15 +19843,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_node_t *name = NULL; if (token_begins_expression_p(parser->current.type)) { - name = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + name = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); } - index = (pm_node_t *) pm_splat_node_create(parser, &star_operator, name); + index = UP(pm_splat_node_create(parser, &star_operator, name)); } else if (token_begins_expression_p(parser->current.type)) { - index = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); + index = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_COMMA, (uint16_t) (depth + 1)); } else { pm_parser_err_token(parser, &for_keyword, PM_ERR_FOR_INDEX); - index = (pm_node_t *) pm_missing_node_create(parser, for_keyword.start, for_keyword.end); + index = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &for_keyword), PM_TOKEN_LENGTH(&for_keyword))); } // Now, if there are multiple index expressions, parse them out. @@ -19705,16 +19867,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b expect1(parser, PM_TOKEN_KEYWORD_IN, PM_ERR_FOR_IN); pm_token_t in_keyword = parser->previous; - pm_node_t *collection = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_FOR_COLLECTION, (uint16_t) (depth + 1)); + pm_node_t *collection = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_FOR_COLLECTION, (uint16_t) (depth + 1)); pm_do_loop_stack_pop(parser); - pm_token_t do_keyword; + pm_token_t do_keyword = { 0 }; if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { do_keyword = parser->previous; } else { - do_keyword = not_provided(parser); if (!match2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_FOR_DELIMITER, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_FOR_DELIMITER, pm_token_str(parser->current.type)); } } @@ -19724,13 +19885,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } parser_warn_indentation_mismatch(parser, opening_newline_index, &for_keyword, false, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_FOR_TERM); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_FOR_TERM, &for_keyword); - return (pm_node_t *) pm_for_node_create(parser, index, collection, statements, &for_keyword, &in_keyword, &do_keyword, &parser->previous); + return UP(pm_for_node_create(parser, index, collection, statements, &for_keyword, &in_keyword, NTOK2PTR(do_keyword), &parser->previous)); } case PM_TOKEN_KEYWORD_IF: if (parser_end_of_line_p(parser)) { - PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_KEYWORD_EOL); + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, &parser->current, PM_WARN_KEYWORD_EOL); } size_t opening_newline_index = token_newline_index(parser); @@ -19747,26 +19908,24 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_undef_node_t *undef = pm_undef_node_create(parser, &parser->previous); pm_node_t *name = parse_undef_argument(parser, (uint16_t) (depth + 1)); - if (PM_NODE_TYPE_P(name, PM_MISSING_NODE)) { - pm_node_destroy(parser, name); + if (PM_NODE_TYPE_P(name, PM_ERROR_RECOVERY_NODE)) { } else { - pm_undef_node_append(undef, name); + pm_undef_node_append(parser->arena, undef, name); while (match1(parser, PM_TOKEN_COMMA)) { lex_state_set(parser, PM_LEX_STATE_FNAME | PM_LEX_STATE_FITEM); parser_lex(parser); name = parse_undef_argument(parser, (uint16_t) (depth + 1)); - if (PM_NODE_TYPE_P(name, PM_MISSING_NODE)) { - pm_node_destroy(parser, name); + if (PM_NODE_TYPE_P(name, PM_ERROR_RECOVERY_NODE)) { break; } - pm_undef_node_append(undef, name); + pm_undef_node_append(parser->arena, undef, name); } } - return (pm_node_t *) undef; + return UP(undef); } case PM_TOKEN_KEYWORD_NOT: { parser_lex(parser); @@ -19775,28 +19934,46 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_arguments_t arguments = { 0 }; pm_node_t *receiver = NULL; + // The `not` keyword without parentheses is only valid in contexts + // where it would be parsed as an expression (i.e., at or below + // the `not` binding power level). In other contexts (e.g., method + // arguments, array elements, assignment right-hand sides), + // parentheses are required: `not(x)`. An exception is made for + // endless def bodies, where `not` is valid as both `arg` and + // `command` (e.g., `def f = not 1`, `def f = not foo bar`). + if (binding_power > PM_BINDING_POWER_NOT && !(flags & PM_PARSE_IN_ENDLESS_DEF) && !match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { + if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT_PARENTHESES)) { + pm_parser_err(parser, PM_TOKEN_END(parser, &parser->previous), 1, PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN); + } else { + accept1(parser, PM_TOKEN_NEWLINE); + pm_parser_err_current(parser, PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER); + } + + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); + } + accept1(parser, PM_TOKEN_NEWLINE); if (accept1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { pm_token_t lparen = parser->previous; if (accept1(parser, PM_TOKEN_PARENTHESIS_RIGHT)) { - receiver = (pm_node_t *) pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0); + receiver = UP(pm_parentheses_node_create(parser, &lparen, NULL, &parser->previous, 0)); } else { - arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&lparen); - receiver = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); + arguments.opening_loc = TOK2LOC(parser, &lparen); + receiver = parse_expression(parser, PM_BINDING_POWER_COMPOSITION, PM_PARSE_ACCEPTS_COMMAND_CALL | PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); if (!parser->recovering) { accept1(parser, PM_TOKEN_NEWLINE); expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments.closing_loc = TOK2LOC(parser, &parser->previous); } } } else { - receiver = parse_expression(parser, PM_BINDING_POWER_NOT, true, false, PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); + receiver = parse_expression(parser, PM_BINDING_POWER_NOT, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_NOT_EXPRESSION, (uint16_t) (depth + 1)); } - return (pm_node_t *) pm_call_node_not_create(parser, receiver, &message, &arguments); + return UP(pm_call_node_not_create(parser, receiver, &message, &arguments)); } case PM_TOKEN_KEYWORD_UNLESS: { size_t opening_newline_index = token_newline_index(parser); @@ -19804,81 +19981,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b return parse_conditional(parser, PM_CONTEXT_UNLESS, opening_newline_index, false, (uint16_t) (depth + 1)); } - case PM_TOKEN_KEYWORD_MODULE: { - pm_node_list_t current_block_exits = { 0 }; - pm_node_list_t *previous_block_exits = push_block_exits(parser, ¤t_block_exits); - - size_t opening_newline_index = token_newline_index(parser); - parser_lex(parser); - pm_token_t module_keyword = parser->previous; - - pm_node_t *constant_path = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_MODULE_NAME, (uint16_t) (depth + 1)); - pm_token_t name; - - // If we can recover from a syntax error that occurred while parsing - // the name of the module, then we'll handle that here. - if (PM_NODE_TYPE_P(constant_path, PM_MISSING_NODE)) { - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - pm_token_t missing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; - return (pm_node_t *) pm_module_node_create(parser, NULL, &module_keyword, constant_path, &missing, NULL, &missing); - } - - while (accept1(parser, PM_TOKEN_COLON_COLON)) { - pm_token_t double_colon = parser->previous; - - expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); - constant_path = (pm_node_t *) pm_constant_path_node_create(parser, constant_path, &double_colon, &parser->previous); - } - - // Here we retrieve the name of the module. If it wasn't a constant, - // then it's possible that `module foo` was passed, which is a - // syntax error. We handle that here as well. - name = parser->previous; - if (name.type != PM_TOKEN_CONSTANT) { - pm_parser_err_token(parser, &name, PM_ERR_MODULE_NAME); - } - - pm_parser_scope_push(parser, true); - accept2(parser, PM_TOKEN_SEMICOLON, PM_TOKEN_NEWLINE); - pm_node_t *statements = NULL; - - if (!match4(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE, PM_TOKEN_KEYWORD_END)) { - pm_accepts_block_stack_push(parser, true); - statements = (pm_node_t *) parse_statements(parser, PM_CONTEXT_MODULE, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); - } - - if (match3(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE, PM_TOKEN_KEYWORD_ELSE)) { - assert(statements == NULL || PM_NODE_TYPE_P(statements, PM_STATEMENTS_NODE)); - statements = (pm_node_t *) parse_rescues_implicit_begin(parser, opening_newline_index, &module_keyword, module_keyword.start, (pm_statements_node_t *) statements, PM_RESCUES_MODULE, (uint16_t) (depth + 1)); - } else { - parser_warn_indentation_mismatch(parser, opening_newline_index, &module_keyword, false, false); - } - - pm_constant_id_list_t locals; - pm_locals_order(parser, &parser->current_scope->locals, &locals, false); - - pm_parser_scope_pop(parser); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_MODULE_TERM); - - if (context_def_p(parser)) { - pm_parser_err_token(parser, &module_keyword, PM_ERR_MODULE_IN_METHOD); - } - - pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_module_node_create(parser, &locals, &module_keyword, constant_path, &name, statements, &parser->previous); - } + case PM_TOKEN_KEYWORD_MODULE: + return parse_module(parser, flags, depth); case PM_TOKEN_KEYWORD_NIL: parser_lex(parser); - return (pm_node_t *) pm_nil_node_create(parser, &parser->previous); + return UP(pm_nil_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD_REDO: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_redo_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_redo_node_create(parser, &parser->previous)); if (!parser->partial_script) parse_block_exit(parser, node); return node; @@ -19886,17 +19997,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b case PM_TOKEN_KEYWORD_RETRY: { parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_retry_node_create(parser, &parser->previous); + pm_node_t *node = UP(pm_retry_node_create(parser, &parser->previous)); parse_retry(parser, node); return node; } case PM_TOKEN_KEYWORD_SELF: parser_lex(parser); - return (pm_node_t *) pm_self_node_create(parser, &parser->previous); + return UP(pm_self_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD_TRUE: parser_lex(parser); - return (pm_node_t *) pm_true_node_create(parser, &parser->previous); + return UP(pm_true_node_create(parser, &parser->previous)); case PM_TOKEN_KEYWORD_UNTIL: { size_t opening_newline_index = token_newline_index(parser); @@ -19905,16 +20016,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); pm_do_loop_stack_pop(parser); context_pop(parser); - pm_token_t do_keyword; + pm_token_t do_keyword = { 0 }; if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { do_keyword = parser->previous; } else { - do_keyword = not_provided(parser); expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CONDITIONAL_UNTIL_PREDICATE); } @@ -19927,9 +20037,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_UNTIL_TERM); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_UNTIL_TERM, &keyword); - return (pm_node_t *) pm_until_node_create(parser, &keyword, &do_keyword, &parser->previous, predicate, statements, 0); + return UP(pm_until_node_create(parser, &keyword, NTOK2PTR(do_keyword), &parser->previous, predicate, statements, 0)); } case PM_TOKEN_KEYWORD_WHILE: { size_t opening_newline_index = token_newline_index(parser); @@ -19939,16 +20049,15 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t keyword = parser->previous; - pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, true, false, PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); + pm_node_t *predicate = parse_value_expression(parser, PM_BINDING_POWER_COMPOSITION, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); pm_do_loop_stack_pop(parser); context_pop(parser); - pm_token_t do_keyword; + pm_token_t do_keyword = { 0 }; if (accept1(parser, PM_TOKEN_KEYWORD_DO_LOOP)) { do_keyword = parser->previous; } else { - do_keyword = not_provided(parser); expect2(parser, PM_TOKEN_NEWLINE, PM_TOKEN_SEMICOLON, PM_ERR_CONDITIONAL_WHILE_PREDICATE); } @@ -19961,381 +20070,122 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b } parser_warn_indentation_mismatch(parser, opening_newline_index, &keyword, false, false); - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_WHILE_TERM); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_WHILE_TERM, &keyword); - return (pm_node_t *) pm_while_node_create(parser, &keyword, &do_keyword, &parser->previous, predicate, statements, 0); + return UP(pm_while_node_create(parser, &keyword, NTOK2PTR(do_keyword), &parser->previous, predicate, statements, 0)); } case PM_TOKEN_PERCENT_LOWER_I: { parser_lex(parser); pm_token_t opening = parser->previous; pm_array_node_t *array = pm_array_node_create(parser, &opening); + pm_node_t *current = NULL; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { accept1(parser, PM_TOKEN_WORDS_SEP); if (match1(parser, PM_TOKEN_STRING_END)) break; - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_array_node_elements_append(array, (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing)); - } - - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT); - } - - pm_token_t closing = parser->current; - if (match1(parser, PM_TOKEN_EOF)) { - pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_LOWER_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; - } else { - expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_LOWER_TERM); - } - pm_array_node_close_set(array, &closing); - - return (pm_node_t *) array; - } - case PM_TOKEN_PERCENT_UPPER_I: { - parser_lex(parser); - pm_token_t opening = parser->previous; - pm_array_node_t *array = pm_array_node_create(parser, &opening); - - // This is the current node that we are parsing that will be added to the - // list of elements. - pm_node_t *current = NULL; - - while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - switch (parser->current.type) { - case PM_TOKEN_WORDS_SEP: { - if (current == NULL) { - // If we hit a separator before we have any content, then we don't - // need to do anything. - } else { - // If we hit a separator after we've hit content, then we need to - // append that content to the list and reset the current node. - pm_array_node_elements_append(array, current); - current = NULL; - } - + // Interpolation is not possible but nested heredocs can still lead to + // consecutive (disjoint) string tokens when the final newline is escaped. + while (match1(parser, PM_TOKEN_STRING_CONTENT)) { + // Record the string node, moving to interpolation if needed. + if (current == NULL) { + current = UP(pm_symbol_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + parser_lex(parser); + pm_interpolated_symbol_node_append(parser->arena, (pm_interpolated_symbol_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { + pm_symbol_node_t *cast = (pm_symbol_node_t *) current; + pm_token_t content = { .type = PM_TOKEN_STRING_CONTENT, .start = parser->start + cast->value_loc.start, .end = parser->start + cast->value_loc.start + cast->value_loc.length }; + pm_node_t *first_string = UP(pm_string_node_create_unescaped(parser, NULL, &content, NULL, &cast->unescaped)); + pm_node_t *second_string = UP(pm_string_node_create_current_string(parser, NULL, &parser->previous, NULL)); parser_lex(parser); - break; - } - case PM_TOKEN_STRING_CONTENT: { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - if (current == NULL) { - // If we hit content and the current node is NULL, then this is - // the first string content we've seen. In that case we're going - // to create a new string node and set that to the current. - current = (pm_node_t *) pm_symbol_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { - // If we hit string content and the current node is an - // interpolated string, then we need to append the string content - // to the list of child nodes. - pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - parser_lex(parser); - - pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, string); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit string content and the current node is a symbol node, - // then we need to convert the current node into an interpolated - // string and add the string content to the list of child nodes. - pm_symbol_node_t *cast = (pm_symbol_node_t *) current; - pm_token_t bounds = not_provided(parser); - - pm_token_t content = { .type = PM_TOKEN_STRING_CONTENT, .start = cast->value_loc.start, .end = cast->value_loc.end }; - pm_node_t *first_string = (pm_node_t *) pm_string_node_create_unescaped(parser, &bounds, &content, &bounds, &cast->unescaped); - pm_node_t *second_string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->previous, &closing); - parser_lex(parser); - - pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); - pm_interpolated_symbol_node_append(interpolated, first_string); - pm_interpolated_symbol_node_append(interpolated, second_string); - - xfree(current); - current = (pm_node_t *) interpolated; - } else { - assert(false && "unreachable"); - } - - break; - } - case PM_TOKEN_EMBVAR: { - bool start_location_set = false; - if (current == NULL) { - // If we hit an embedded variable and the current node is NULL, - // then this is the start of a new string. We'll set the current - // node to a new interpolated string. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - current = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit an embedded variable and the current node is a string - // node, then we'll convert the current into an interpolated - // string and add the string node to the list of parts. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); - - current = (pm_node_t *) pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current); - pm_interpolated_symbol_node_append(interpolated, current); - interpolated->base.location.start = current->location.start; - start_location_set = true; - current = (pm_node_t *) interpolated; - } else { - // If we hit an embedded variable and the current node is an - // interpolated string, then we'll just add the embedded variable. - } - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part); - if (!start_location_set) { - current->location.start = part->location.start; - } - break; - } - case PM_TOKEN_EMBEXPR_BEGIN: { - bool start_location_set = false; - if (current == NULL) { - // If we hit an embedded expression and the current node is NULL, - // then this is the start of a new string. We'll set the current - // node to a new interpolated string. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - current = (pm_node_t *) pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); - } else if (PM_NODE_TYPE_P(current, PM_SYMBOL_NODE)) { - // If we hit an embedded expression and the current node is a - // string node, then we'll convert the current into an - // interpolated string and add the string node to the list of - // parts. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, &opening, NULL, &closing); - - current = (pm_node_t *) pm_symbol_node_to_string_node(parser, (pm_symbol_node_t *) current); - pm_interpolated_symbol_node_append(interpolated, current); - interpolated->base.location.start = current->location.start; - start_location_set = true; - current = (pm_node_t *) interpolated; - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_SYMBOL_NODE)) { - // If we hit an embedded expression and the current node is an - // interpolated string, then we'll just continue on. - } else { - assert(false && "unreachable"); - } + pm_interpolated_symbol_node_t *interpolated = pm_interpolated_symbol_node_create(parser, NULL, NULL, NULL); + pm_interpolated_symbol_node_append(parser->arena, interpolated, first_string); + pm_interpolated_symbol_node_append(parser->arena, interpolated, second_string); - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_symbol_node_append((pm_interpolated_symbol_node_t *) current, part); - if (!start_location_set) { - current->location.start = part->location.start; - } - break; + // current is arena-allocated so no explicit free is needed. + current = UP(interpolated); + } else { + assert(false && "unreachable"); } - default: - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_UPPER_ELEMENT); - parser_lex(parser); - break; } - } - // If we have a current node, then we need to append it to the list. - if (current) { - pm_array_node_elements_append(array, current); + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } else { + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_I_LOWER_ELEMENT); + } } pm_token_t closing = parser->current; if (match1(parser, PM_TOKEN_EOF)) { - pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_UPPER_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + pm_parser_err_token(parser, &opening, PM_ERR_LIST_I_LOWER_TERM); + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; } else { - expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_UPPER_TERM); + expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_I_LOWER_TERM); } - pm_array_node_close_set(array, &closing); + pm_array_node_close_set(parser, array, &closing); - return (pm_node_t *) array; + return UP(array); } + case PM_TOKEN_PERCENT_UPPER_I: + return parse_symbol_array(parser, depth); case PM_TOKEN_PERCENT_LOWER_W: { parser_lex(parser); pm_token_t opening = parser->previous; pm_array_node_t *array = pm_array_node_create(parser, &opening); - - // skip all leading whitespaces - accept1(parser, PM_TOKEN_WORDS_SEP); + pm_node_t *current = NULL; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { accept1(parser, PM_TOKEN_WORDS_SEP); if (match1(parser, PM_TOKEN_STRING_END)) break; - if (match1(parser, PM_TOKEN_STRING_CONTENT)) { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - pm_array_node_elements_append(array, string); + // Interpolation is not possible but nested heredocs can still lead to + // consecutive (disjoint) string tokens when the final newline is escaped. + while (match1(parser, PM_TOKEN_STRING_CONTENT)) { + pm_node_t *string = UP(pm_string_node_create_current_string(parser, NULL, &parser->current, NULL)); + + // Record the string node, moving to interpolation if needed. + if (current == NULL) { + current = string; + } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { + pm_interpolated_string_node_append(parser, (pm_interpolated_string_node_t *) current, string); + } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { + pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, NULL, NULL, NULL); + pm_interpolated_string_node_append(parser, interpolated, current); + pm_interpolated_string_node_append(parser, interpolated, string); + current = UP(interpolated); + } else { + assert(false && "unreachable"); + } + parser_lex(parser); } - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT); + if (current) { + pm_array_node_elements_append(parser->arena, array, current); + current = NULL; + } else { + expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_LOWER_ELEMENT); + } } pm_token_t closing = parser->current; if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_LOWER_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_LOWER_TERM); } - pm_array_node_close_set(array, &closing); - return (pm_node_t *) array; - } - case PM_TOKEN_PERCENT_UPPER_W: { - parser_lex(parser); - pm_token_t opening = parser->previous; - pm_array_node_t *array = pm_array_node_create(parser, &opening); - - // This is the current node that we are parsing that will be added - // to the list of elements. - pm_node_t *current = NULL; - - while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { - switch (parser->current.type) { - case PM_TOKEN_WORDS_SEP: { - // Reset the explicit encoding if we hit a separator - // since each element can have its own encoding. - parser->explicit_encoding = NULL; - - if (current == NULL) { - // If we hit a separator before we have any content, - // then we don't need to do anything. - } else { - // If we hit a separator after we've hit content, - // then we need to append that content to the list - // and reset the current node. - pm_array_node_elements_append(array, current); - current = NULL; - } - - parser_lex(parser); - break; - } - case PM_TOKEN_STRING_CONTENT: { - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - pm_node_t *string = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing); - pm_node_flag_set(string, parse_unescaped_encoding(parser)); - parser_lex(parser); - - if (current == NULL) { - // If we hit content and the current node is NULL, - // then this is the first string content we've seen. - // In that case we're going to create a new string - // node and set that to the current. - current = string; - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit string content and the current node is - // an interpolated string, then we need to append - // the string content to the list of child nodes. - pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, string); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit string content and the current node is - // a string node, then we need to convert the - // current node into an interpolated string and add - // the string content to the list of child nodes. - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); - pm_interpolated_string_node_append(interpolated, current); - pm_interpolated_string_node_append(interpolated, string); - current = (pm_node_t *) interpolated; - } else { - assert(false && "unreachable"); - } - - break; - } - case PM_TOKEN_EMBVAR: { - if (current == NULL) { - // If we hit an embedded variable and the current - // node is NULL, then this is the start of a new - // string. We'll set the current node to a new - // interpolated string. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded variable and the current - // node is a string node, then we'll convert the - // current into an interpolated string and add the - // string node to the list of parts. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); - pm_interpolated_string_node_append(interpolated, current); - current = (pm_node_t *) interpolated; - } else { - // If we hit an embedded variable and the current - // node is an interpolated string, then we'll just - // add the embedded variable. - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part); - break; - } - case PM_TOKEN_EMBEXPR_BEGIN: { - if (current == NULL) { - // If we hit an embedded expression and the current - // node is NULL, then this is the start of a new - // string. We'll set the current node to a new - // interpolated string. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - current = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, NULL, &closing); - } else if (PM_NODE_TYPE_P(current, PM_STRING_NODE)) { - // If we hit an embedded expression and the current - // node is a string node, then we'll convert the - // current into an interpolated string and add the - // string node to the list of parts. - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_interpolated_string_node_t *interpolated = pm_interpolated_string_node_create(parser, &opening, NULL, &closing); - pm_interpolated_string_node_append(interpolated, current); - current = (pm_node_t *) interpolated; - } else if (PM_NODE_TYPE_P(current, PM_INTERPOLATED_STRING_NODE)) { - // If we hit an embedded expression and the current - // node is an interpolated string, then we'll just - // continue on. - } else { - assert(false && "unreachable"); - } - - pm_node_t *part = parse_string_part(parser, (uint16_t) (depth + 1)); - pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, part); - break; - } - default: - expect1(parser, PM_TOKEN_STRING_CONTENT, PM_ERR_LIST_W_UPPER_ELEMENT); - parser_lex(parser); - break; - } - } - - // If we have a current node, then we need to append it to the list. - if (current) { - pm_array_node_elements_append(array, current); - } - - pm_token_t closing = parser->current; - if (match1(parser, PM_TOKEN_EOF)) { - pm_parser_err_token(parser, &opening, PM_ERR_LIST_W_UPPER_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; - } else { - expect1(parser, PM_TOKEN_STRING_END, PM_ERR_LIST_W_UPPER_TERM); - } - - pm_array_node_close_set(array, &closing); - return (pm_node_t *) array; + pm_array_node_close_set(parser, array, &closing); + return UP(array); } + case PM_TOKEN_PERCENT_UPPER_W: + return parse_string_array(parser, depth); case PM_TOKEN_REGEXP_BEGIN: { pm_token_t opening = parser->current; parser_lex(parser); @@ -20352,10 +20202,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); - pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); - pm_node_flag_set(node, PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING); - - return node; + pm_regular_expression_node_t *node = pm_regular_expression_node_create(parser, &opening, &content, &parser->previous); + pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL)); + return UP(node); } pm_interpolated_regular_expression_node_t *interpolated; @@ -20367,7 +20216,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // regular expression) or if it's not then it has interpolation. pm_string_t unescaped = parser->current_string; pm_token_t content = parser->current; - bool ascii_only = parser->current_regular_expression_ascii_only; parser_lex(parser); // If we hit an end, then we can create a regular expression @@ -20376,26 +20224,22 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b if (accept1(parser, PM_TOKEN_REGEXP_END)) { pm_regular_expression_node_t *node = (pm_regular_expression_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); - // If we're not immediately followed by a =~, then we want - // to parse all of the errors at this point. If it is - // followed by a =~, then it will get parsed higher up while - // parsing the named captures as well. + // If we're not immediately followed by a =~, then we + // parse and validate now. If it is followed by a =~, + // then it will get parsed in the =~ handler where + // named captures can also be extracted. if (!match1(parser, PM_TOKEN_EQUAL_TILDE)) { - parse_regular_expression_errors(parser, node); + pm_node_flag_set(UP(node), pm_regexp_parse(parser, node, NULL, NULL)); } - pm_node_flag_set((pm_node_t *) node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->base.flags)); - return (pm_node_t *) node; + return UP(node); } // If we get here, then we have interpolation so we'll need to create // a regular expression node with interpolation. interpolated = pm_interpolated_regular_expression_node_create(parser, &opening); - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); - + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, NULL, &parser->previous, NULL, &unescaped)); if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { // This is extremely strange, but the first string part of a // regular expression will always be tagged as binary if we @@ -20403,7 +20247,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_node_flag_set(part, PM_STRING_FLAGS_FORCED_BINARY_ENCODING); } - pm_interpolated_regular_expression_node_append(interpolated, part); + pm_interpolated_regular_expression_node_append(parser->arena, interpolated, part); } else { // If the first part of the body of the regular expression is not a // string content, then we have interpolation and we need to create an @@ -20416,20 +20260,20 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_node_t *part; while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_interpolated_regular_expression_node_append(interpolated, part); + pm_interpolated_regular_expression_node_append(parser->arena, interpolated, part); } } pm_token_t closing = parser->current; if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_REGEXP_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; } else { expect1(parser, PM_TOKEN_REGEXP_END, PM_ERR_REGEXP_TERM); } pm_interpolated_regular_expression_node_closing_set(parser, interpolated, &closing); - return (pm_node_t *) interpolated; + return UP(interpolated); } case PM_TOKEN_BACKTICK: case PM_TOKEN_PERCENT_LOWER_X: { @@ -20451,7 +20295,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b }; parser_lex(parser); - return (pm_node_t *) pm_xstring_node_create(parser, &opening, &content, &parser->previous); + return UP(pm_xstring_node_create(parser, &opening, &content, &parser->previous)); } pm_interpolated_x_string_node_t *node; @@ -20466,7 +20310,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); if (match1(parser, PM_TOKEN_STRING_END)) { - pm_node_t *node = (pm_node_t *) pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); + pm_node_t *node = UP(pm_xstring_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped)); pm_node_flag_set(node, parse_unescaped_encoding(parser)); parser_lex(parser); return node; @@ -20476,13 +20320,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // create a string node with interpolation. node = pm_interpolated_xstring_node_create(parser, &opening, &opening); - pm_token_t opening = not_provided(parser); - pm_token_t closing = not_provided(parser); - - pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); + pm_node_t *part = UP(pm_string_node_create_unescaped(parser, NULL, &parser->previous, NULL, &unescaped)); pm_node_flag_set(part, parse_unescaped_encoding(parser)); - pm_interpolated_xstring_node_append(node, part); + pm_interpolated_xstring_node_append(parser->arena, node, part); } else { // If the first part of the body of the string is not a string // content, then we have interpolation and we need to create an @@ -20493,20 +20334,20 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_node_t *part; while (!match2(parser, PM_TOKEN_STRING_END, PM_TOKEN_EOF)) { if ((part = parse_string_part(parser, (uint16_t) (depth + 1))) != NULL) { - pm_interpolated_xstring_node_append(node, part); + pm_interpolated_xstring_node_append(parser->arena, node, part); } } pm_token_t closing = parser->current; if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_XSTRING_TERM); - closing = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + closing = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; } else { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_XSTRING_TERM); } - pm_interpolated_xstring_node_closing_set(node, &closing); + pm_interpolated_xstring_node_closing_set(parser, node, &closing); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_USTAR: { parser_lex(parser); @@ -20516,17 +20357,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // still lex past it though and create a missing node place. if (binding_power != PM_BINDING_POWER_STATEMENT) { pm_parser_err_prefix(parser, diag_id); - return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous))); } pm_token_t operator = parser->previous; pm_node_t *name = NULL; if (token_begins_expression_p(parser->current.type)) { - name = parse_expression(parser, PM_BINDING_POWER_INDEX, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); + name = parse_expression(parser, PM_BINDING_POWER_INDEX, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_STAR, (uint16_t) (depth + 1)); } - pm_node_t *splat = (pm_node_t *) pm_splat_node_create(parser, &operator, name); + pm_node_t *splat = UP(pm_splat_node_create(parser, &operator, name)); if (match1(parser, PM_TOKEN_COMMA)) { return parse_targets_validate(parser, splat, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); @@ -20542,11 +20383,11 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t operator = parser->previous; - pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, binding_power < PM_BINDING_POWER_MATCH, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (binding_power < PM_BINDING_POWER_MATCH ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0)), PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "!"); pm_conditional_predicate(parser, receiver, PM_CONDITIONAL_PREDICATE_TYPE_NOT); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_TILDE: { if (binding_power > PM_BINDING_POWER_UNARY) { @@ -20555,10 +20396,10 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t operator = parser->previous; - pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "~"); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_UMINUS: { if (binding_power > PM_BINDING_POWER_UNARY) { @@ -20567,22 +20408,22 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t operator = parser->previous; - pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "-@"); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_UMINUS_NUM: { parser_lex(parser); pm_token_t operator = parser->previous; - pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_node_t *node = parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); if (accept1(parser, PM_TOKEN_STAR_STAR)) { pm_token_t exponent_operator = parser->previous; - pm_node_t *exponent = parse_expression(parser, pm_binding_powers[exponent_operator.type].right, false, false, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); - node = (pm_node_t *) pm_call_node_binary_create(parser, node, &exponent_operator, exponent, 0); - node = (pm_node_t *) pm_call_node_unary_create(parser, &operator, node, "-@"); + pm_node_t *exponent = parse_expression(parser, pm_binding_powers[exponent_operator.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_ARGUMENT, (uint16_t) (depth + 1)); + node = UP(pm_call_node_binary_create(parser, node, &exponent_operator, exponent, 0)); + node = UP(pm_call_node_unary_create(parser, &operator, node, "-@")); } else { switch (PM_NODE_TYPE(node)) { case PM_INTEGER_NODE: @@ -20592,7 +20433,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parse_negative_numeric(node); break; default: - node = (pm_node_t *) pm_call_node_unary_create(parser, &operator, node, "-@"); + node = UP(pm_call_node_unary_create(parser, &operator, node, "-@")); break; } } @@ -20626,13 +20467,12 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b accept1(parser, PM_TOKEN_NEWLINE); expect1(parser, PM_TOKEN_PARENTHESIS_RIGHT, PM_ERR_EXPECT_RPAREN); - pm_block_parameters_node_closing_set(block_parameters, &parser->previous); + pm_block_parameters_node_closing_set(parser, block_parameters, &parser->previous); break; } case PM_CASE_PARAMETER: { pm_accepts_block_stack_push(parser, false); - pm_token_t opening = not_provided(parser); - block_parameters = parse_block_parameters(parser, false, &opening, true, false, (uint16_t) (depth + 1)); + block_parameters = parse_block_parameters(parser, false, NULL, true, false, (uint16_t) (depth + 1)); pm_accepts_block_stack_pop(parser); break; } @@ -20650,39 +20490,37 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b opening = parser->previous; if (!match1(parser, PM_TOKEN_BRACE_RIGHT)) { - body = (pm_node_t *) parse_statements(parser, PM_CONTEXT_LAMBDA_BRACES, (uint16_t) (depth + 1)); + body = UP(parse_statements(parser, PM_CONTEXT_LAMBDA_BRACES, (uint16_t) (depth + 1))); } parser_warn_indentation_mismatch(parser, opening_newline_index, &operator, false, false); - expect1(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_LAMBDA_TERM_BRACE); + expect1_opening(parser, PM_TOKEN_BRACE_RIGHT, PM_ERR_LAMBDA_TERM_BRACE, &opening); } else { expect1(parser, PM_TOKEN_KEYWORD_DO, PM_ERR_LAMBDA_OPEN); opening = parser->previous; if (!match3(parser, PM_TOKEN_KEYWORD_END, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { - pm_accepts_block_stack_push(parser, true); - body = (pm_node_t *) parse_statements(parser, PM_CONTEXT_LAMBDA_DO_END, (uint16_t) (depth + 1)); - pm_accepts_block_stack_pop(parser); + body = UP(parse_statements(parser, PM_CONTEXT_LAMBDA_DO_END, (uint16_t) (depth + 1))); } if (match2(parser, PM_TOKEN_KEYWORD_RESCUE, PM_TOKEN_KEYWORD_ENSURE)) { assert(body == NULL || PM_NODE_TYPE_P(body, PM_STATEMENTS_NODE)); - body = (pm_node_t *) parse_rescues_implicit_begin(parser, opening_newline_index, &operator, opening.start, (pm_statements_node_t *) body, PM_RESCUES_LAMBDA, (uint16_t) (depth + 1)); + body = UP(parse_rescues_implicit_begin(parser, opening_newline_index, &operator, opening.start, (pm_statements_node_t *) body, PM_RESCUES_LAMBDA, (uint16_t) (depth + 1))); } else { parser_warn_indentation_mismatch(parser, opening_newline_index, &operator, false, false); } - expect1(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END); + expect1_opening(parser, PM_TOKEN_KEYWORD_END, PM_ERR_LAMBDA_TERM_END, &operator); } pm_constant_id_list_t locals; pm_locals_order(parser, &parser->current_scope->locals, &locals, pm_parser_scope_toplevel_p(parser)); - pm_node_t *parameters = parse_blocklike_parameters(parser, (pm_node_t *) block_parameters, &operator, &parser->previous); + pm_node_t *parameters = parse_blocklike_parameters(parser, UP(block_parameters), &operator, &parser->previous); pm_parser_scope_pop(parser); pm_accepts_block_stack_pop(parser); - return (pm_node_t *) pm_lambda_node_create(parser, &locals, &operator, &opening, &parser->previous, parameters, body); + return UP(pm_lambda_node_create(parser, &locals, &operator, &opening, &parser->previous, parameters, body)); } case PM_TOKEN_UPLUS: { if (binding_power > PM_BINDING_POWER_UNARY) { @@ -20691,13 +20529,13 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b parser_lex(parser); pm_token_t operator = parser->previous; - pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, false, false, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); + pm_node_t *receiver = parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_UNARY_RECEIVER, (uint16_t) (depth + 1)); pm_call_node_t *node = pm_call_node_unary_create(parser, &operator, receiver, "+@"); - return (pm_node_t *) node; + return UP(node); } case PM_TOKEN_STRING_BEGIN: - return parse_strings(parser, NULL, accepts_label, (uint16_t) (depth + 1)); + return parse_strings(parser, NULL, flags & PM_PARSE_ACCEPTS_LABEL, (uint16_t) (depth + 1)); case PM_TOKEN_SYMBOL_BEGIN: { pm_lex_mode_t lex_mode = *parser->lex_modes.current; parser_lex(parser); @@ -20720,17 +20558,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // If we get here, then we are assuming this token is closing a // parent context, so we'll indicate that to the user so that // they know how we behaved. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_type_human(parser->current.type), context_human(recoverable)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT, pm_token_str(parser->current.type), context_human(recoverable)); } else if (diag_id == PM_ERR_CANNOT_PARSE_EXPRESSION) { // We're going to make a special case here, because "cannot // parse expression" is pretty generic, and we know here that we // have an unexpected token. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_UNEXPECTED_TOKEN_IGNORE, pm_token_str(parser->current.type)); } else { pm_parser_err_prefix(parser, diag_id); } - return (pm_node_t *) pm_missing_node_create(parser, parser->previous.start, parser->previous.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->previous), PM_TOKEN_LENGTH(&parser->previous))); } } } @@ -20745,8 +20583,18 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b * or any of the binary operators that can be written to a variable. */ static pm_node_t * -parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) { - pm_node_t *value = parse_value_expression(parser, binding_power, previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? accepts_command_call : previous_binding_power < PM_BINDING_POWER_MATCH, false, diag_id, (uint16_t) (depth + 1)); +parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { + pm_node_t *value = parse_value_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? (flags & PM_PARSE_ACCEPTS_COMMAND_CALL) : (previous_binding_power < PM_BINDING_POWER_MATCH ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0))), diag_id, (uint16_t) (depth + 1)); + + // Assignments whose value is a command call (e.g., a = b c) can only + // be followed by modifiers (if/unless/while/until/rescue) and not by + // operators with higher binding power. If we find one, emit an error + // and skip the operator and its right-hand side. + if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER && (pm_command_call_value_p(value) || pm_block_call_p(value))) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + parser_lex(parser); + parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + } // Contradicting binding powers, the right-hand-side value of the assignment // allows the `rescue` modifier. @@ -20756,10 +20604,10 @@ parse_assignment_value(pm_parser_t *parser, pm_binding_power_t previous_binding_ pm_token_t rescue = parser->current; parser_lex(parser); - pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, false, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); context_pop(parser); - return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right); + return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); } return value; @@ -20814,35 +20662,46 @@ parse_assignment_value_local(pm_parser_t *parser, const pm_node_t *node) { * operator that allows multiple values after it. */ static pm_node_t * -parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, pm_diagnostic_id_t diag_id, uint16_t depth) { +parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { bool permitted = true; if (previous_binding_power != PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_USTAR)) permitted = false; - pm_node_t *value = parse_starred_expression(parser, binding_power, previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? accepts_command_call : previous_binding_power < PM_BINDING_POWER_MATCH, diag_id, (uint16_t) (depth + 1)); + pm_node_t *value = parse_starred_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (previous_binding_power == PM_BINDING_POWER_ASSIGNMENT ? (flags & PM_PARSE_ACCEPTS_COMMAND_CALL) : (previous_binding_power < PM_BINDING_POWER_MODIFIER ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0))), diag_id, (uint16_t) (depth + 1)); if (!permitted) pm_parser_err_node(parser, value, PM_ERR_UNEXPECTED_MULTI_WRITE); parse_assignment_value_local(parser, value); bool single_value = true; - if (previous_binding_power == PM_BINDING_POWER_STATEMENT && (PM_NODE_TYPE_P(value, PM_SPLAT_NODE) || match1(parser, PM_TOKEN_COMMA))) { + // Block calls (command call + do block, e.g., `foo bar do end`) cannot + // be followed by a comma to form a multi-value RHS because each element + // of a multi-value assignment must be an `arg`, not a `block_call`. + if (previous_binding_power == PM_BINDING_POWER_STATEMENT && !pm_block_call_p(value) && (PM_NODE_TYPE_P(value, PM_SPLAT_NODE) || match1(parser, PM_TOKEN_COMMA))) { single_value = false; - pm_token_t opening = not_provided(parser); - pm_array_node_t *array = pm_array_node_create(parser, &opening); - - pm_array_node_elements_append(array, value); - value = (pm_node_t *) array; + pm_array_node_t *array = pm_array_node_create(parser, NULL); + pm_array_node_elements_append(parser->arena, array, value); + value = UP(array); while (accept1(parser, PM_TOKEN_COMMA)) { pm_node_t *element = parse_starred_expression(parser, binding_power, false, PM_ERR_ARRAY_ELEMENT, (uint16_t) (depth + 1)); - pm_array_node_elements_append(array, element); - if (PM_NODE_TYPE_P(element, PM_MISSING_NODE)) break; + pm_array_node_elements_append(parser->arena, array, element); + if (PM_NODE_TYPE_P(element, PM_ERROR_RECOVERY_NODE)) break; parse_assignment_value_local(parser, element); } } + // Assignments whose value is a command call (e.g., a = b c) can only + // be followed by modifiers (if/unless/while/until/rescue) and not by + // operators with higher binding power. If we find one, emit an error + // and skip the operator and its right-hand side. + if (single_value && pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER && (pm_command_call_value_p(value) || pm_block_call_p(value))) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(parser->current.type)); + parser_lex(parser); + parse_expression(parser, pm_binding_powers[parser->previous.type].right, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + } + // Contradicting binding powers, the right-hand-side value of the assignment // allows the `rescue` modifier. if ((single_value || (binding_power == (PM_BINDING_POWER_MULTI_ASSIGNMENT + 1))) && match1(parser, PM_TOKEN_KEYWORD_RESCUE_MODIFIER)) { @@ -20857,15 +20716,15 @@ parse_assignment_values(pm_parser_t *parser, pm_binding_power_t previous_binding // but without parenthesis. if (PM_NODE_TYPE_P(value, PM_CALL_NODE)) { pm_call_node_t *call_node = (pm_call_node_t *) value; - if ((call_node->arguments != NULL) && (call_node->opening_loc.start == NULL)) { + if ((call_node->arguments != NULL) && (call_node->opening_loc.length == 0)) { accepts_command_call_inner = true; } } - pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, accepts_command_call_inner, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + pm_node_t *right = parse_expression(parser, pm_binding_powers[PM_TOKEN_KEYWORD_RESCUE_MODIFIER].right, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (accepts_command_call_inner ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0)), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); context_pop(parser); - return (pm_node_t *) pm_rescue_modifier_node_create(parser, value, &rescue, right); + return UP(pm_rescue_modifier_node_create(parser, value, &rescue, right)); } return value; @@ -20882,43 +20741,18 @@ static void parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const pm_token_t *operator) { if (call_node->arguments != NULL) { pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_ARGUMENTS); - pm_node_destroy(parser, (pm_node_t *) call_node->arguments); + pm_node_unreference(parser, UP(call_node->arguments)); call_node->arguments = NULL; } if (call_node->block != NULL) { pm_parser_err_token(parser, operator, PM_ERR_OPERATOR_WRITE_BLOCK); - pm_node_destroy(parser, (pm_node_t *) call_node->block); + pm_node_unreference(parser, UP(call_node->block)); call_node->block = NULL; } } -/** - * This struct is used to pass information between the regular expression parser - * and the named capture callback. - */ -typedef struct { - /** The parser that is parsing the regular expression. */ - pm_parser_t *parser; - - /** The call node wrapping the regular expression node. */ - pm_call_node_t *call; - - /** The match write node that is being created. */ - pm_match_write_node_t *match; - - /** The list of names that have been parsed. */ - pm_constant_id_list_t names; - - /** - * Whether the content of the regular expression is shared. This impacts - * whether or not we used owned constants or shared constants in the - * constant pool for the names of the captures. - */ - bool shared; -} parse_regular_expression_named_capture_data_t; - -static inline const uint8_t * +static PRISM_INLINE const uint8_t * pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { cursor++; @@ -20939,7 +20773,7 @@ pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const return cursor; } -static inline const uint8_t * +static PRISM_INLINE const uint8_t * pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { uint8_t value = (uint8_t) (*cursor - '0'); cursor++; @@ -20958,8 +20792,8 @@ pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, con return cursor; } -static inline const uint8_t * -pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) { +static PRISM_INLINE const uint8_t * +pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end, const pm_location_t *error_location) { const uint8_t *start = cursor - 1; cursor++; @@ -20970,7 +20804,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con if (*cursor != '{') { size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4)); - uint32_t value = escape_unicode(parser, cursor, length); + uint32_t value = escape_unicode(parser, cursor, length, error_location, 0); if (!pm_buffer_append_unicode_codepoint(unescaped, value)) { pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start)); @@ -20990,7 +20824,10 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con } size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor); - uint32_t value = escape_unicode(parser, cursor, length); + if (length == 0) { + break; + } + uint32_t value = escape_unicode(parser, cursor, length, error_location, 0); (void) pm_buffer_append_unicode_codepoint(unescaped, value); cursor += length; @@ -21000,7 +20837,7 @@ pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, con } static void -pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) { +pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor, const pm_location_t *error_location) { const uint8_t *end = source + length; pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source)); @@ -21018,7 +20855,7 @@ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8 cursor = pm_named_capture_escape_octal(unescaped, cursor, end); break; case 'u': - cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end); + cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end, error_location); break; default: pm_buffer_append_byte(unescaped, '\\'); @@ -21040,10 +20877,7 @@ pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8 * capture group. */ static void -parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { - parse_regular_expression_named_capture_data_t *callback_data = (parse_regular_expression_named_capture_data_t *) data; - - pm_parser_t *parser = callback_data->parser; +parse_regular_expression_named_capture(pm_parser_t *parser, const pm_string_t *capture, bool shared, pm_regexp_name_data_t *callback_data) { pm_call_node_t *call = callback_data->call; pm_constant_id_list_t *names = &callback_data->names; @@ -21061,55 +20895,56 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { // unescaped, which is what we need. const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding); if (PRISM_UNLIKELY(cursor != NULL)) { - pm_named_capture_escape(parser, &unescaped, source, length, cursor); + pm_named_capture_escape(parser, &unescaped, source, length, cursor, shared ? NULL : &call->receiver->location); source = (const uint8_t *) pm_buffer_value(&unescaped); length = pm_buffer_length(&unescaped); } - pm_location_t location; + const uint8_t *start; + const uint8_t *end; pm_constant_id_t name; // If the name of the capture group isn't a valid identifier, we do // not add it to the local table. if (!pm_slice_is_valid_local(parser, source, source + length)) { - pm_buffer_free(&unescaped); + pm_buffer_cleanup(&unescaped); return; } - if (callback_data->shared) { + if (shared) { // If the unescaped string is a slice of the source, then we can // copy the names directly. The pointers will line up. - location = (pm_location_t) { .start = source, .end = source + length }; - name = pm_parser_constant_id_location(parser, location.start, location.end); + start = source; + end = source + length; + name = pm_parser_constant_id_raw(parser, start, end); } else { // Otherwise, the name is a slice of the malloc-ed owned string, // in which case we need to copy it out into a new string. - location = (pm_location_t) { .start = call->receiver->location.start, .end = call->receiver->location.end }; - - void *memory = xmalloc(length); - if (memory == NULL) abort(); + start = parser->start + PM_NODE_START(call->receiver); + end = parser->start + PM_NODE_END(call->receiver); + uint8_t *memory = (uint8_t *) pm_arena_alloc(parser->arena, length, 1); memcpy(memory, source, length); - name = pm_parser_constant_id_owned(parser, (uint8_t *) memory, length); + name = pm_parser_constant_id_owned(parser, memory, length); } // Add this name to the list of constants if it is valid, not duplicated, // and not a keyword. if (name != 0 && !pm_constant_id_list_includes(names, name)) { - pm_constant_id_list_append(names, name); + pm_constant_id_list_append(parser->arena, names, name); int depth; if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) { // If the local is not already a local but it is a keyword, then we // do not want to add a capture for this. if (pm_local_is_keyword((const char *) source, length)) { - pm_buffer_free(&unescaped); + pm_buffer_cleanup(&unescaped); return; } // If the identifier is not already a local, then we will add it to // the local table. - pm_parser_local_add(parser, name, location.start, location.end, 0); + pm_parser_local_add(parser, name, start, end, 0); } // Here we lazily create the MatchWriteNode since we know we're @@ -21120,45 +20955,37 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) { // Next, create the local variable target and add it to the list of // targets for the match. - pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth); - pm_node_list_append(&callback_data->match->targets, target); + pm_node_t *target = UP(pm_local_variable_target_node_create(parser, &TOK2LOC(parser, &((pm_token_t) { .type = 0, .start = start, .end = end })), name, depth == -1 ? 0 : (uint32_t) depth)); + pm_node_list_append(parser->arena, &callback_data->match->targets, target); } - pm_buffer_free(&unescaped); + pm_buffer_cleanup(&unescaped); } /** - * Potentially change a =~ with a regular expression with named captures into a - * match write node. + * Potentially change a =~ with an interpolated regular expression with named + * captures into a match write node. This is for the interpolated case where + * we have concatenated content rather than a regular expression node. */ static pm_node_t * -parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) { - parse_regular_expression_named_capture_data_t callback_data = { - .parser = parser, +parse_interpolated_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call, bool extended_mode) { + pm_regexp_name_data_t callback_data = { .call = call, + .match = NULL, .names = { 0 }, - .shared = content->type == PM_STRING_SHARED }; - parse_regular_expression_error_data_t error_data = { - .parser = parser, - .start = call->receiver->location.start, - .end = call->receiver->location.end, - .shared = content->type == PM_STRING_SHARED - }; - - pm_regexp_parse(parser, pm_string_source(content), pm_string_length(content), extended_mode, parse_regular_expression_named_capture, &callback_data, parse_regular_expression_error, &error_data); - pm_constant_id_list_free(&callback_data.names); + pm_regexp_parse_named_captures(parser, pm_string_source(content), pm_string_length(content), false, extended_mode, parse_regular_expression_named_capture, &callback_data); if (callback_data.match != NULL) { - return (pm_node_t *) callback_data.match; + return UP(callback_data.match); } else { - return (pm_node_t *) call; + return UP(call); } } -static inline pm_node_t * -parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, bool accepts_command_call, uint16_t depth) { +static PRISM_INLINE pm_node_t * +parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power, uint8_t flags, uint16_t depth) { pm_token_t token = parser->current; switch (token.type) { @@ -21171,13 +20998,20 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // is parsed because it could be referenced in the value. pm_call_node_t *call_node = (pm_call_node_t *) node; if (PM_NODE_FLAG_P(call_node, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { - pm_parser_local_add_location(parser, call_node->message_loc.start, call_node->message_loc.end, 0); + pm_parser_local_add_location(parser, &call_node->message_loc, 0); } } PRISM_FALLTHROUGH case PM_CASE_WRITABLE: { + // When we have `it = value`, we need to add `it` as a local + // variable before parsing the value, in case the value + // references the variable. + if (PM_NODE_TYPE_P(node, PM_IT_LOCAL_VARIABLE_READ_NODE)) { + pm_parser_local_add_location(parser, &node->location, 0); + } + parser_lex(parser); - pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) ? PM_BINDING_POWER_MULTI_ASSIGNMENT + 1 : binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) ? PM_BINDING_POWER_MULTI_ASSIGNMENT + 1 : binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); if (PM_NODE_TYPE_P(node, PM_MULTI_TARGET_NODE) && previous_binding_power != PM_BINDING_POWER_STATEMENT) { pm_parser_err_node(parser, node, PM_ERR_UNEXPECTED_MULTI_WRITE); @@ -21190,8 +21024,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_multi_target_node_targets_append(parser, multi_target, node); parser_lex(parser); - pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_BINDING_POWER_MULTI_ASSIGNMENT + 1, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); - return parse_write(parser, (pm_node_t *) multi_target, &token, value); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, PM_BINDING_POWER_MULTI_ASSIGNMENT + 1, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + return parse_write(parser, UP(multi_target), &token, value); } case PM_SOURCE_ENCODING_NODE: case PM_FALSE_NODE: @@ -21203,7 +21037,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // In these special cases, we have specific error messages // and we will replace them with local variable writes. parser_lex(parser); - pm_node_t *value = parse_assignment_values(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); + pm_node_t *value = parse_assignment_values(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_EQUAL, (uint16_t) (depth + 1)); return parse_unwriteable_write(parser, node, &token, value); } default: @@ -21224,71 +21058,65 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t case PM_GLOBAL_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_global_variable_and_write_node_create(parser, node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_and_write_node_create(parser, node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CLASS_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_class_variable_and_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_and_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CONSTANT_PATH_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_path_and_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_and_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); return parse_shareable_constant_write(parser, write); } case PM_CONSTANT_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_and_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_and_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return parse_shareable_constant_write(parser, write); } case PM_INSTANCE_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_instance_variable_and_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_and_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_IT_LOCAL_VARIABLE_READ_NODE: { pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_and_write_node_create(parser, node, &token, value, name, 0); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, node, &token, value, name, 0)); - parse_target_implicit_parameter(parser, node); - pm_node_destroy(parser, node); + pm_node_unreference(parser, node); return result; } case PM_LOCAL_VARIABLE_READ_NODE: { - if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { - PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); - parse_target_implicit_parameter(parser, node); + if (pm_token_is_numbered_parameter(parser, PM_NODE_START(node), PM_NODE_LENGTH(node))) { + PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.length, PM_ERR_PARAMETER_NUMBERED_RESERVED, parser->start + node->location.start); + pm_node_unreference(parser, node); } pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_and_write_node_create(parser, node, &token, value, cast->name, cast->depth); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, node, &token, value, cast->name, cast->depth)); - pm_node_destroy(parser, node); return result; } case PM_CALL_NODE: { @@ -21298,16 +21126,13 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // receiver that could have been a local variable) then we // will transform it into a local variable write. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { - pm_location_t *message_loc = &cast->message_loc; - pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); - - pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); + pm_refute_numbered_parameter(parser, cast->message_loc.start, cast->message_loc.length); + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, &cast->message_loc, 1); parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_and_write_node_create(parser, (pm_node_t *) cast, &token, value, constant_id, 0); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_and_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); - pm_node_destroy(parser, (pm_node_t *) cast); return result; } @@ -21319,8 +21144,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // this is an aref expression, and we can transform it into // an aset expression. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_index_and_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + return UP(pm_index_and_write_node_create(parser, cast, &token, value)); } // If this node cannot be writable, then we have an error. @@ -21331,8 +21156,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t } parse_call_operator_write(parser, cast, &token); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_and_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_AMPAMPEQ, (uint16_t) (depth + 1)); + return UP(pm_call_and_write_node_create(parser, cast, &token, value)); } case PM_MULTI_WRITE_NODE: { parser_lex(parser); @@ -21358,71 +21183,65 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t case PM_GLOBAL_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_global_variable_or_write_node_create(parser, node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_or_write_node_create(parser, node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CLASS_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_class_variable_or_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_or_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CONSTANT_PATH_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_path_or_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_or_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); return parse_shareable_constant_write(parser, write); } case PM_CONSTANT_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_or_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_or_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return parse_shareable_constant_write(parser, write); } case PM_INSTANCE_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_instance_variable_or_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_or_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_IT_LOCAL_VARIABLE_READ_NODE: { pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_or_write_node_create(parser, node, &token, value, name, 0); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, node, &token, value, name, 0)); - parse_target_implicit_parameter(parser, node); - pm_node_destroy(parser, node); + pm_node_unreference(parser, node); return result; } case PM_LOCAL_VARIABLE_READ_NODE: { - if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { - PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); - parse_target_implicit_parameter(parser, node); + if (pm_token_is_numbered_parameter(parser, PM_NODE_START(node), PM_NODE_LENGTH(node))) { + PM_PARSER_ERR_FORMAT(parser, PM_NODE_START(node), PM_NODE_LENGTH(node), PM_ERR_PARAMETER_NUMBERED_RESERVED, parser->start + PM_NODE_START(node)); + pm_node_unreference(parser, node); } pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_or_write_node_create(parser, node, &token, value, cast->name, cast->depth); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, node, &token, value, cast->name, cast->depth)); - pm_node_destroy(parser, node); return result; } case PM_CALL_NODE: { @@ -21432,16 +21251,13 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // receiver that could have been a local variable) then we // will transform it into a local variable write. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { - pm_location_t *message_loc = &cast->message_loc; - pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); - - pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); + pm_refute_numbered_parameter(parser, cast->message_loc.start, cast->message_loc.length); + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, &cast->message_loc, 1); parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_or_write_node_create(parser, (pm_node_t *) cast, &token, value, constant_id, 0); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_or_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); - pm_node_destroy(parser, (pm_node_t *) cast); return result; } @@ -21453,8 +21269,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // this is an aref expression, and we can transform it into // an aset expression. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_index_or_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + return UP(pm_index_or_write_node_create(parser, cast, &token, value)); } // If this node cannot be writable, then we have an error. @@ -21465,8 +21281,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t } parse_call_operator_write(parser, cast, &token); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_or_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ, (uint16_t) (depth + 1)); + return UP(pm_call_or_write_node_create(parser, cast, &token, value)); } case PM_MULTI_WRITE_NODE: { parser_lex(parser); @@ -21502,71 +21318,65 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t case PM_GLOBAL_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_global_variable_operator_write_node_create(parser, node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_global_variable_operator_write_node_create(parser, node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CLASS_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_class_variable_operator_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_class_variable_operator_write_node_create(parser, (pm_class_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_CONSTANT_PATH_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_path_operator_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_path_operator_write_node_create(parser, (pm_constant_path_node_t *) node, &token, value)); return parse_shareable_constant_write(parser, write); } case PM_CONSTANT_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *write = (pm_node_t *) pm_constant_operator_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *write = UP(pm_constant_operator_write_node_create(parser, (pm_constant_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return parse_shareable_constant_write(parser, write); } case PM_INSTANCE_VARIABLE_READ_NODE: { parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_instance_variable_operator_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_instance_variable_operator_write_node_create(parser, (pm_instance_variable_read_node_t *) node, &token, value)); - pm_node_destroy(parser, node); return result; } case PM_IT_LOCAL_VARIABLE_READ_NODE: { pm_constant_id_t name = pm_parser_local_add_constant(parser, "it", 2); parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_operator_write_node_create(parser, node, &token, value, name, 0); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, node, &token, value, name, 0)); - parse_target_implicit_parameter(parser, node); - pm_node_destroy(parser, node); + pm_node_unreference(parser, node); return result; } case PM_LOCAL_VARIABLE_READ_NODE: { - if (pm_token_is_numbered_parameter(node->location.start, node->location.end)) { - PM_PARSER_ERR_FORMAT(parser, node->location.start, node->location.end, PM_ERR_PARAMETER_NUMBERED_RESERVED, node->location.start); - parse_target_implicit_parameter(parser, node); + if (pm_token_is_numbered_parameter(parser, PM_NODE_START(node), PM_NODE_LENGTH(node))) { + PM_PARSER_ERR_FORMAT(parser, PM_NODE_START(node), PM_NODE_LENGTH(node), PM_ERR_PARAMETER_NUMBERED_RESERVED, parser->start + PM_NODE_START(node)); + pm_node_unreference(parser, node); } pm_local_variable_read_node_t *cast = (pm_local_variable_read_node_t *) node; parser_lex(parser); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_operator_write_node_create(parser, node, &token, value, cast->name, cast->depth); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, node, &token, value, cast->name, cast->depth)); - pm_node_destroy(parser, node); return result; } case PM_CALL_NODE: { @@ -21577,14 +21387,11 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // receiver that could have been a local variable) then we // will transform it into a local variable write. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_VARIABLE_CALL)) { - pm_location_t *message_loc = &cast->message_loc; - pm_refute_numbered_parameter(parser, message_loc->start, message_loc->end); + pm_refute_numbered_parameter(parser, cast->message_loc.start, cast->message_loc.length); + pm_constant_id_t constant_id = pm_parser_local_add_location(parser, &cast->message_loc, 1); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *result = UP(pm_local_variable_operator_write_node_create(parser, UP(cast), &token, value, constant_id, 0)); - pm_constant_id_t constant_id = pm_parser_local_add_location(parser, message_loc->start, message_loc->end, 1); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - pm_node_t *result = (pm_node_t *) pm_local_variable_operator_write_node_create(parser, (pm_node_t *) cast, &token, value, constant_id, 0); - - pm_node_destroy(parser, (pm_node_t *) cast); return result; } @@ -21592,8 +21399,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // this is an aref expression, and we can transform it into // an aset expression. if (PM_NODE_FLAG_P(cast, PM_CALL_NODE_FLAGS_INDEX)) { - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_index_operator_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_index_operator_write_node_create(parser, cast, &token, value)); } // If this node cannot be writable, then we have an error. @@ -21604,8 +21411,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t } parse_call_operator_write(parser, cast, &token); - pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, accepts_command_call, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_operator_write_node_create(parser, cast, &token, value); + pm_node_t *value = parse_assignment_value(parser, previous_binding_power, binding_power, flags, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_operator_write_node_create(parser, cast, &token, value)); } case PM_MULTI_WRITE_NODE: { parser_lex(parser); @@ -21618,7 +21425,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // In this case we have an operator but we don't know what it's for. // We need to treat it as an error. For now, we'll mark it as an error // and just skip right past it. - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->previous, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, pm_token_type_human(parser->current.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->previous, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, pm_token_str(parser->current.type)); return node; } } @@ -21626,15 +21433,15 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t case PM_TOKEN_KEYWORD_AND: { parser_lex(parser); - pm_node_t *right = parse_expression(parser, binding_power, parser->previous.type == PM_TOKEN_KEYWORD_AND, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_and_node_create(parser, node, &token, right); + pm_node_t *right = parse_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (parser->previous.type == PM_TOKEN_KEYWORD_AND ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0)), PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_and_node_create(parser, node, &token, right)); } case PM_TOKEN_KEYWORD_OR: case PM_TOKEN_PIPE_PIPE: { parser_lex(parser); - pm_node_t *right = parse_expression(parser, binding_power, parser->previous.type == PM_TOKEN_KEYWORD_OR, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_or_node_create(parser, node, &token, right); + pm_node_t *right = parse_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | (parser->previous.type == PM_TOKEN_KEYWORD_OR ? PM_PARSE_ACCEPTS_COMMAND_CALL : 0)), PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_or_node_create(parser, node, &token, right)); } case PM_TOKEN_EQUAL_TILDE: { // Note that we _must_ parse the value before adding the local @@ -21645,11 +21452,11 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // // In this case, `foo` should be a method call and not a local yet. parser_lex(parser); - pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + pm_node_t *argument = parse_expression(parser, binding_power, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); // By default, we're going to create a call node and then return it. pm_call_node_t *call = pm_call_node_binary_create(parser, node, &token, argument, 0); - pm_node_t *result = (pm_node_t *) call; + pm_node_t *result = UP(call); // If the receiver of this =~ is a regular expression node, then we // need to introduce local variables for it based on its named @@ -21690,14 +21497,25 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_string_t owned; pm_string_owned_init(&owned, (uint8_t *) memory, total_length); - result = parse_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); - pm_string_free(&owned); + result = parse_interpolated_regular_expression_named_captures(parser, &owned, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + pm_string_cleanup(&owned); } } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) { - // If we have a regular expression node, then we can just parse - // the named captures directly off the unescaped string. - const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped; - result = parse_regular_expression_named_captures(parser, content, call, PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED)); + // If we have a regular expression node, then we can parse + // the named captures and validate encoding in one pass. + pm_regular_expression_node_t *regexp = (pm_regular_expression_node_t *) node; + + pm_regexp_name_data_t name_data = { + .call = call, + .match = NULL, + .names = { 0 }, + }; + + pm_node_flag_set(UP(regexp), pm_regexp_parse(parser, regexp, parse_regular_expression_named_capture, &name_data)); + + if (name_data.match != NULL) { + result = UP(name_data.match); + } } return result; @@ -21729,21 +21547,21 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t case PM_RESCUE_MODIFIER_NODE: { pm_rescue_modifier_node_t *cast = (pm_rescue_modifier_node_t *) node; if (PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } case PM_AND_NODE: { pm_and_node_t *cast = (pm_and_node_t *) node; if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } case PM_OR_NODE: { pm_or_node_t *cast = (pm_or_node_t *) node; if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } @@ -21751,20 +21569,20 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t break; } - pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_node_binary_create(parser, node, &token, argument, 0); + pm_node_t *argument = parse_expression(parser, binding_power, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_node_binary_create(parser, node, &token, argument, 0)); } case PM_TOKEN_GREATER: case PM_TOKEN_GREATER_EQUAL: case PM_TOKEN_LESS: case PM_TOKEN_LESS_EQUAL: { if (PM_NODE_TYPE_P(node, PM_CALL_NODE) && PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_COMPARISON)) { - PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, parser->current, PM_WARN_COMPARISON_AFTER_COMPARISON); + PM_PARSER_WARN_TOKEN_FORMAT_CONTENT(parser, &parser->current, PM_WARN_COMPARISON_AFTER_COMPARISON); } parser_lex(parser); - pm_node_t *argument = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_node_binary_create(parser, node, &token, argument, PM_CALL_NODE_FLAGS_COMPARISON); + pm_node_t *argument = parse_expression(parser, binding_power, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + return UP(pm_call_node_binary_create(parser, node, &token, argument, PM_CALL_NODE_FLAGS_COMPARISON)); } case PM_TOKEN_AMPERSAND_DOT: case PM_TOKEN_DOT: { @@ -21775,28 +21593,28 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // This if statement handles the foo.() syntax. if (match1(parser, PM_TOKEN_PARENTHESIS_LEFT)) { parse_arguments_list(parser, &arguments, true, false, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_node_shorthand_create(parser, node, &operator, &arguments); + return UP(pm_call_node_shorthand_create(parser, node, &operator, &arguments)); } switch (PM_NODE_TYPE(node)) { case PM_RESCUE_MODIFIER_NODE: { pm_rescue_modifier_node_t *cast = (pm_rescue_modifier_node_t *) node; if (PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->rescue_expression, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } case PM_AND_NODE: { pm_and_node_t *cast = (pm_and_node_t *) node; if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } case PM_OR_NODE: { pm_or_node_t *cast = (pm_or_node_t *) node; if (PM_NODE_TYPE_P(cast->right, PM_MATCH_PREDICATE_NODE) || PM_NODE_TYPE_P(cast->right, PM_MATCH_REQUIRED_NODE)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_type_human(operator.type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &operator, PM_ERR_EXPECT_EOL_AFTER_STATEMENT, pm_token_str(operator.type)); } break; } @@ -21817,23 +21635,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t break; } default: { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_EXPECT_MESSAGE, pm_token_type_human(parser->current.type)); - message = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_EXPECT_MESSAGE, pm_token_str(parser->current.type)); + message = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; } } - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); pm_call_node_t *call = pm_call_node_call_create(parser, node, &operator, &message, &arguments); if ( (previous_binding_power == PM_BINDING_POWER_STATEMENT) && arguments.arguments == NULL && - arguments.opening_loc.start == NULL && + arguments.opening_loc.length == 0 && match1(parser, PM_TOKEN_COMMA) ) { - return parse_targets_validate(parser, (pm_node_t *) call, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + return parse_targets_validate(parser, UP(call), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); } else { - return (pm_node_t *) call; + return UP(call); } } case PM_TOKEN_DOT_DOT: @@ -21842,40 +21660,40 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_node_t *right = NULL; if (token_begins_expression_p(parser->current.type)) { - right = parse_expression(parser, binding_power, false, false, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); + right = parse_expression(parser, binding_power, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_EXPECT_EXPRESSION_AFTER_OPERATOR, (uint16_t) (depth + 1)); } - return (pm_node_t *) pm_range_node_create(parser, node, &token, right); + return UP(pm_range_node_create(parser, node, &token, right)); } case PM_TOKEN_KEYWORD_IF_MODIFIER: { pm_token_t keyword = parser->current; parser_lex(parser); - pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_if_node_modifier_create(parser, node, &keyword, predicate); + pm_node_t *predicate = parse_value_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_IF_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_if_node_modifier_create(parser, node, &keyword, predicate)); } case PM_TOKEN_KEYWORD_UNLESS_MODIFIER: { pm_token_t keyword = parser->current; parser_lex(parser); - pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_unless_node_modifier_create(parser, node, &keyword, predicate); + pm_node_t *predicate = parse_value_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_UNLESS_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_unless_node_modifier_create(parser, node, &keyword, predicate)); } case PM_TOKEN_KEYWORD_UNTIL_MODIFIER: { parser_lex(parser); pm_statements_node_t *statements = pm_statements_node_create(parser); pm_statements_node_body_append(parser, statements, node, true); - pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_until_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? PM_LOOP_FLAGS_BEGIN_MODIFIER : 0); + pm_node_t *predicate = parse_value_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_UNTIL_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_until_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? PM_LOOP_FLAGS_BEGIN_MODIFIER : 0)); } case PM_TOKEN_KEYWORD_WHILE_MODIFIER: { parser_lex(parser); pm_statements_node_t *statements = pm_statements_node_create(parser); pm_statements_node_body_append(parser, statements, node, true); - pm_node_t *predicate = parse_value_expression(parser, binding_power, true, false, PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_while_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? PM_LOOP_FLAGS_BEGIN_MODIFIER : 0); + pm_node_t *predicate = parse_value_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_CONDITIONAL_WHILE_PREDICATE, (uint16_t) (depth + 1)); + return UP(pm_while_node_modifier_create(parser, &token, predicate, statements, PM_NODE_TYPE_P(node, PM_BEGIN_NODE) ? PM_LOOP_FLAGS_BEGIN_MODIFIER : 0)); } case PM_TOKEN_QUESTION_MARK: { context_push(parser, PM_CONTEXT_TERNARY); @@ -21885,7 +21703,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_token_t qmark = parser->current; parser_lex(parser); - pm_node_t *true_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_TERNARY_EXPRESSION_TRUE, (uint16_t) (depth + 1)); + pm_node_t *true_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_TERNARY_EXPRESSION_TRUE, (uint16_t) (depth + 1)); if (parser->recovering) { // If parsing the true expression of this ternary resulted in a syntax @@ -21894,27 +21712,23 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // before the `expect` function call to make sure it doesn't // accidentally move past a ':' token that occurs after the syntax // error. - pm_token_t colon = (pm_token_t) { .type = PM_TOKEN_MISSING, .start = parser->previous.end, .end = parser->previous.end }; - pm_node_t *false_expression = (pm_node_t *) pm_missing_node_create(parser, colon.start, colon.end); + pm_token_t colon = (pm_token_t) { .type = 0, .start = parser->previous.end, .end = parser->previous.end }; + pm_node_t *false_expression = UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &colon), PM_TOKEN_LENGTH(&colon))); context_pop(parser); pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression); + return UP(pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression)); } accept1(parser, PM_TOKEN_NEWLINE); expect1(parser, PM_TOKEN_COLON, PM_ERR_TERNARY_COLON); pm_token_t colon = parser->previous; - pm_node_t *false_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, false, false, PM_ERR_TERNARY_EXPRESSION_FALSE, (uint16_t) (depth + 1)); + pm_node_t *false_expression = parse_expression(parser, PM_BINDING_POWER_DEFINED, flags & PM_PARSE_ACCEPTS_DO_BLOCK, PM_ERR_TERNARY_EXPRESSION_FALSE, (uint16_t) (depth + 1)); context_pop(parser); pop_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); - - return (pm_node_t *) pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression); + return UP(pm_if_node_ternary_create(parser, node, &qmark, true_expression, &colon, false_expression)); } case PM_TOKEN_COLON_COLON: { parser_lex(parser); @@ -21927,7 +21741,7 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t if ( (parser->current.type == PM_TOKEN_PARENTHESIS_LEFT) || - (accepts_command_call && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) + ((flags & PM_PARSE_ACCEPTS_COMMAND_CALL) && (token_begins_expression_p(parser->current.type) || match3(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_USTAR_STAR))) ) { // If we have a constant immediately following a '::' operator, then // this can either be a constant path or a method call, depending on @@ -21938,11 +21752,11 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_token_t message = parser->previous; pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); - path = (pm_node_t *) pm_call_node_call_create(parser, node, &delimiter, &message, &arguments); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); + path = UP(pm_call_node_call_create(parser, node, &delimiter, &message, &arguments)); } else { // Otherwise, this is a constant path. That would look like Foo::Bar. - path = (pm_node_t *) pm_constant_path_node_create(parser, node, &delimiter, &parser->previous); + path = UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); } // If this is followed by a comma then it is a multiple assignment. @@ -21962,15 +21776,15 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t // If we have an identifier following a '::' operator, then it is for // sure a method call. pm_arguments_t arguments = { 0 }; - parse_arguments_list(parser, &arguments, true, accepts_command_call, (uint16_t) (depth + 1)); + parse_arguments_list(parser, &arguments, true, flags, (uint16_t) (depth + 1)); pm_call_node_t *call = pm_call_node_call_create(parser, node, &delimiter, &message, &arguments); // If this is followed by a comma then it is a multiple assignment. if (previous_binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { - return parse_targets_validate(parser, (pm_node_t *) call, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + return parse_targets_validate(parser, UP(call), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); } - return (pm_node_t *) call; + return UP(call); } case PM_TOKEN_PARENTHESIS_LEFT: { // If we have a parenthesis following a '::' operator, then it is the @@ -21978,11 +21792,11 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_arguments_t arguments = { 0 }; parse_arguments_list(parser, &arguments, true, false, (uint16_t) (depth + 1)); - return (pm_node_t *) pm_call_node_shorthand_create(parser, node, &delimiter, &arguments); + return UP(pm_call_node_shorthand_create(parser, node, &delimiter, &arguments)); } default: { expect1(parser, PM_TOKEN_CONSTANT, PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT); - return (pm_node_t *) pm_constant_path_node_create(parser, node, &delimiter, &parser->previous); + return UP(pm_constant_path_node_create(parser, node, &delimiter, &parser->previous)); } } } @@ -21991,31 +21805,31 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t parser_lex(parser); accept1(parser, PM_TOKEN_NEWLINE); - pm_node_t *value = parse_expression(parser, binding_power, true, false, PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); + pm_node_t *value = parse_expression(parser, binding_power, (uint8_t) ((flags & PM_PARSE_ACCEPTS_DO_BLOCK) | PM_PARSE_ACCEPTS_COMMAND_CALL), PM_ERR_RESCUE_MODIFIER_VALUE, (uint16_t) (depth + 1)); context_pop(parser); - return (pm_node_t *) pm_rescue_modifier_node_create(parser, node, &token, value); + return UP(pm_rescue_modifier_node_create(parser, node, &token, value)); } case PM_TOKEN_BRACKET_LEFT: { parser_lex(parser); pm_arguments_t arguments = { 0 }; - arguments.opening_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments.opening_loc = TOK2LOC(parser, &parser->previous); if (!accept1(parser, PM_TOKEN_BRACKET_RIGHT)) { pm_accepts_block_stack_push(parser, true); - parse_arguments(parser, &arguments, false, PM_TOKEN_BRACKET_RIGHT, (uint16_t) (depth + 1)); + parse_arguments(parser, &arguments, false, PM_TOKEN_BRACKET_RIGHT, (uint8_t) (flags & ~PM_PARSE_ACCEPTS_DO_BLOCK), (uint16_t) (depth + 1)); pm_accepts_block_stack_pop(parser); expect1(parser, PM_TOKEN_BRACKET_RIGHT, PM_ERR_EXPECT_RBRACKET); } - arguments.closing_loc = PM_LOCATION_TOKEN_VALUE(&parser->previous); + arguments.closing_loc = TOK2LOC(parser, &parser->previous); // If we have a comma after the closing bracket then this is a multiple // assignment and we should parse the targets. if (previous_binding_power == PM_BINDING_POWER_STATEMENT && match1(parser, PM_TOKEN_COMMA)) { pm_call_node_t *aref = pm_call_node_aref_create(parser, node, &arguments); - return parse_targets_validate(parser, (pm_node_t *) aref, PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); + return parse_targets_validate(parser, UP(aref), PM_BINDING_POWER_INDEX, (uint16_t) (depth + 1)); } // If we're at the end of the arguments, we can now check if there is a @@ -22031,17 +21845,17 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t if (block != NULL) { if (arguments.block != NULL) { - pm_parser_err_node(parser, (pm_node_t *) block, PM_ERR_ARGUMENT_AFTER_BLOCK); + pm_parser_err_node(parser, UP(block), PM_ERR_ARGUMENT_AFTER_BLOCK); if (arguments.arguments == NULL) { arguments.arguments = pm_arguments_node_create(parser); } - pm_arguments_node_arguments_append(arguments.arguments, arguments.block); + pm_arguments_node_arguments_append(parser->arena, arguments.arguments, arguments.block); } - arguments.block = (pm_node_t *) block; + arguments.block = UP(block); } - return (pm_node_t *) pm_call_node_aref_create(parser, node, &arguments); + return UP(pm_call_node_aref_create(parser, node, &arguments)); } case PM_TOKEN_KEYWORD_IN: { bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; @@ -22056,9 +21870,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_IN, (uint16_t) (depth + 1)); parser->pattern_matching_newlines = previous_pattern_matching_newlines; - pm_constant_id_list_free(&captures); - return (pm_node_t *) pm_match_predicate_node_create(parser, node, pattern, &operator); + return UP(pm_match_predicate_node_create(parser, node, pattern, &operator)); } case PM_TOKEN_EQUAL_GREATER: { bool previous_pattern_matching_newlines = parser->pattern_matching_newlines; @@ -22073,9 +21886,8 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t pm_node_t *pattern = parse_pattern(parser, &captures, PM_PARSE_PATTERN_TOP | PM_PARSE_PATTERN_MULTI, PM_ERR_PATTERN_EXPRESSION_AFTER_HROCKET, (uint16_t) (depth + 1)); parser->pattern_matching_newlines = previous_pattern_matching_newlines; - pm_constant_id_list_free(&captures); - return (pm_node_t *) pm_match_required_node_create(parser, node, pattern, &operator); + return UP(pm_match_required_node_create(parser, node, pattern, &operator)); } default: assert(false && "unreachable"); @@ -22088,16 +21900,83 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t #undef PM_PARSE_PATTERN_MULTI /** - * Determine if a given call node looks like a "command", which means it has - * arguments but does not have parentheses. + * Some nodes act as statements and limit which operators can follow. This + * function inspects the node and the upcoming token to determine whether the + * expression loop should stop. It is called both after prefix parsing and after + * each infix operator. + * + * As a side effect, this function also attaches do-blocks to command-style call + * nodes when appropriate. + * + * Returns true if the expression loop should stop (i.e., the next operator + * should not be consumed). */ -static inline bool -pm_call_node_command_p(const pm_call_node_t *node) { - return ( - (node->opening_loc.start == NULL) && - (node->block == NULL || PM_NODE_TYPE_P(node->block, PM_BLOCK_ARGUMENT_NODE)) && - (node->arguments != NULL || node->block != NULL) - ); +static bool +parse_expression_terminator(pm_parser_t *parser, pm_node_t *node) { + pm_binding_power_t left = pm_binding_powers[parser->current.type].left; + + switch (PM_NODE_TYPE(node)) { + case PM_MULTI_WRITE_NODE: + case PM_RETURN_NODE: + case PM_BREAK_NODE: + case PM_NEXT_NODE: + return left > PM_BINDING_POWER_MODIFIER; + case PM_CLASS_VARIABLE_WRITE_NODE: + case PM_CONSTANT_PATH_WRITE_NODE: + case PM_CONSTANT_WRITE_NODE: + case PM_GLOBAL_VARIABLE_WRITE_NODE: + case PM_INSTANCE_VARIABLE_WRITE_NODE: + case PM_LOCAL_VARIABLE_WRITE_NODE: + return PM_NODE_FLAG_P(node, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY) && left > PM_BINDING_POWER_MODIFIER; + case PM_CALL_NODE: { + // Calls with an implicit array on the right-hand side are + // statements and can only be followed by modifiers. + if (PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY)) { + return left > PM_BINDING_POWER_MODIFIER; + } + + // Command-style calls (including block commands like + // `foo bar do end`) can only be followed by composition + // (and/or) and modifier (if/unless/etc.) operators. + if (pm_command_call_value_p(node)) { + return left > PM_BINDING_POWER_COMPOSITION; + } + + // A block call (command with do-block, or any call chained + // from one) can only be followed by call chaining (., ::, + // &.), composition (and/or), and modifier operators. + if (pm_block_call_p(node)) { + return left > PM_BINDING_POWER_COMPOSITION && left < PM_BINDING_POWER_CALL; + } + + return false; + } + case PM_SUPER_NODE: + case PM_YIELD_NODE: + // Command-style super/yield (without parens) can only be followed + // by composition and modifier operators. + if (pm_command_call_value_p(node)) { + return left > PM_BINDING_POWER_COMPOSITION; + } + return false; + case PM_DEF_NODE: + // An endless method whose body is a command-style call (e.g., + // `def f = foo bar`) is a command assignment and can only be + // followed by modifiers. + return left > PM_BINDING_POWER_MODIFIER && pm_command_call_value_p(node); + case PM_RESCUE_MODIFIER_NODE: + // A rescue modifier whose handler is a pattern match (=> or in) + // produces a statement and cannot be followed by operators above + // the modifier level. + if (left > PM_BINDING_POWER_MODIFIER) { + pm_rescue_modifier_node_t *cast = (pm_rescue_modifier_node_t *) node; + pm_node_t *rescue_expression = cast->rescue_expression; + return PM_NODE_TYPE_P(rescue_expression, PM_MATCH_REQUIRED_NODE) || PM_NODE_TYPE_P(rescue_expression, PM_MATCH_PREDICATE_NODE); + } + return false; + default: + return false; + } } /** @@ -22109,46 +21988,40 @@ pm_call_node_command_p(const pm_call_node_t *node) { * determine if they need to perform additional cleanup. */ static pm_node_t * -parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool accepts_command_call, bool accepts_label, pm_diagnostic_id_t diag_id, uint16_t depth) { +parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, uint8_t flags, pm_diagnostic_id_t diag_id, uint16_t depth) { if (PRISM_UNLIKELY(depth >= PRISM_DEPTH_MAXIMUM)) { pm_parser_err_current(parser, PM_ERR_NESTING_TOO_DEEP); - return (pm_node_t *) pm_missing_node_create(parser, parser->current.start, parser->current.end); + return UP(pm_error_recovery_node_create(parser, PM_TOKEN_START(parser, &parser->current), PM_TOKEN_LENGTH(&parser->current))); } - pm_node_t *node = parse_expression_prefix(parser, binding_power, accepts_command_call, accepts_label, diag_id, depth); + pm_node_t *node = parse_expression_prefix(parser, binding_power, flags, diag_id, depth); + // Some prefix nodes are statements and can only be followed by modifiers + // (if/unless/while/until/rescue) or nothing at all. We check these cheaply + // here before entering the infix loop. switch (PM_NODE_TYPE(node)) { - case PM_MISSING_NODE: - // If we found a syntax error, then the type of node returned by - // parse_expression_prefix is going to be a missing node. + case PM_ERROR_RECOVERY_NODE: return node; case PM_PRE_EXECUTION_NODE: + return node; case PM_POST_EXECUTION_NODE: case PM_ALIAS_GLOBAL_VARIABLE_NODE: case PM_ALIAS_METHOD_NODE: - case PM_MULTI_WRITE_NODE: case PM_UNDEF_NODE: - // These expressions are statements, and cannot be followed by - // operators (except modifiers). if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { return node; } break; case PM_CALL_NODE: - // If we have a call node, then we need to check if it looks like a - // method call without parentheses that contains arguments. If it - // does, then it has different rules for parsing infix operators, - // namely that it only accepts composition (and/or) and modifiers - // (if/unless/etc.). - if ((pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_COMPOSITION) && pm_call_node_command_p((pm_call_node_t *) node)) { + case PM_SUPER_NODE: + case PM_YIELD_NODE: + case PM_DEF_NODE: + if (parse_expression_terminator(parser, node)) { return node; } break; case PM_SYMBOL_NODE: - // If we have a symbol node that is being parsed as a label, then we - // need to immediately return, because there should never be an - // infix operator following this node. - if (pm_symbol_node_label_p(node)) { + if (pm_symbol_node_label_p(parser, node)) { return node; } break; @@ -22156,8 +22029,8 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc break; } - // Otherwise we'll look and see if the next token can be parsed as an infix - // operator. If it can, then we'll parse it using parse_expression_infix. + // Look and see if the next token can be parsed as an infix operator. If it + // can, then we'll parse it using parse_expression_infix. pm_binding_powers_t current_binding_powers; pm_token_type_t current_token_type; @@ -22167,39 +22040,8 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc binding_power <= current_binding_powers.left && current_binding_powers.binary ) { - node = parse_expression_infix(parser, node, binding_power, current_binding_powers.right, accepts_command_call, (uint16_t) (depth + 1)); - - switch (PM_NODE_TYPE(node)) { - case PM_MULTI_WRITE_NODE: - // Multi-write nodes are statements, and cannot be followed by - // operators except modifiers. - if (pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { - return node; - } - break; - case PM_CLASS_VARIABLE_WRITE_NODE: - case PM_CONSTANT_PATH_WRITE_NODE: - case PM_CONSTANT_WRITE_NODE: - case PM_GLOBAL_VARIABLE_WRITE_NODE: - case PM_INSTANCE_VARIABLE_WRITE_NODE: - case PM_LOCAL_VARIABLE_WRITE_NODE: - // These expressions are statements, by virtue of the right-hand - // side of their write being an implicit array. - if (PM_NODE_FLAG_P(node, PM_WRITE_NODE_FLAGS_IMPLICIT_ARRAY) && pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { - return node; - } - break; - case PM_CALL_NODE: - // These expressions are also statements, by virtue of the - // right-hand side of the expression (i.e., the last argument to - // the call node) being an implicit array. - if (PM_NODE_FLAG_P(node, PM_CALL_NODE_FLAGS_IMPLICIT_ARRAY) && pm_binding_powers[parser->current.type].left > PM_BINDING_POWER_MODIFIER) { - return node; - } - break; - default: - break; - } + node = parse_expression_infix(parser, node, binding_power, current_binding_powers.right, flags, (uint16_t) (depth + 1)); + if (parse_expression_terminator(parser, node)) return node; // If the operator is nonassoc and we should not be able to parse the // upcoming infix operator, break. @@ -22207,7 +22049,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc // If this is a non-assoc operator and we are about to parse the // exact same operator, then we need to add an error. if (match1(parser, current_token_type)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_type_human(parser->current.type), pm_token_type_human(current_token_type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_str(parser->current.type), pm_token_str(current_token_type)); break; } @@ -22220,7 +22062,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc // if (PM_NODE_TYPE_P(node, PM_RANGE_NODE) && ((pm_range_node_t *) node)->right == NULL) { if (match4(parser, PM_TOKEN_UAMPERSAND, PM_TOKEN_USTAR, PM_TOKEN_DOT, PM_TOKEN_AMPERSAND_DOT)) { - PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_type_human(parser->current.type), pm_token_type_human(current_token_type)); + PM_PARSER_ERR_TOKEN_FORMAT(parser, &parser->current, PM_ERR_NON_ASSOCIATIVE_OPERATOR, pm_token_str(parser->current.type), pm_token_str(current_token_type)); break; } @@ -22232,7 +22074,7 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc } } - if (accepts_command_call) { + if (flags & PM_PARSE_ACCEPTS_COMMAND_CALL) { // A command-style method call is only accepted on method chains. // Thus, we check whether the parsed node can continue method chains. // The method chain can continue if the parsed node is one of the following five kinds: @@ -22247,29 +22089,29 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc if ( // (1) foo[1] !( - cast->call_operator_loc.start == NULL && - cast->message_loc.start != NULL && - cast->message_loc.start[0] == '[' && - cast->message_loc.end[-1] == ']' + cast->call_operator_loc.length == 0 && + cast->message_loc.length > 0 && + parser->start[cast->message_loc.start] == '[' && + parser->start[cast->message_loc.start + cast->message_loc.length - 1] == ']' ) && // (2) foo.bar !( - cast->call_operator_loc.start != NULL && + cast->call_operator_loc.length > 0 && cast->arguments == NULL && cast->block == NULL && - cast->opening_loc.start == NULL + cast->opening_loc.length == 0 ) && // (3) foo.bar(1) !( - cast->call_operator_loc.start != NULL && - cast->opening_loc.start != NULL + cast->call_operator_loc.length > 0 && + cast->opening_loc.length > 0 ) && // (4) foo.bar do end !( cast->block != NULL && PM_NODE_TYPE_P(cast->block, PM_BLOCK_NODE) ) ) { - accepts_command_call = false; + flags &= (uint8_t) ~PM_PARSE_ACCEPTS_COMMAND_CALL; } break; } @@ -22277,10 +22119,21 @@ parse_expression(pm_parser_t *parser, pm_binding_power_t binding_power, bool acc case PM_CONSTANT_PATH_NODE: break; default: - accepts_command_call = false; + flags &= (uint8_t) ~PM_PARSE_ACCEPTS_COMMAND_CALL; break; } } + + if (context_terminator(parser->current_context->context, &parser->current)) { + pm_binding_powers_t next_binding_powers = pm_binding_powers[parser->current.type]; + if ( + !next_binding_powers.binary || + binding_power > next_binding_powers.left || + (PM_NODE_TYPE_P(node, PM_CALL_NODE) && pm_call_node_command_p((pm_call_node_t *) node)) + ) { + return node; + } + } } return node; @@ -22299,15 +22152,16 @@ wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) { pm_arguments_node_t *arguments = pm_arguments_node_create(parser); pm_arguments_node_arguments_append( + parser->arena, arguments, - (pm_node_t *) pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$_", 2)) + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$_", 2))) ); - pm_statements_node_body_append(parser, statements, (pm_node_t *) pm_call_node_fcall_synthesized_create( + pm_statements_node_body_append(parser, statements, UP(pm_call_node_fcall_synthesized_create( parser, arguments, pm_parser_constant_id_constant(parser, "print", 5) - ), true); + )), true); } if (PM_PARSER_COMMAND_LINE_OPTION_N(parser)) { @@ -22318,47 +22172,49 @@ wrap_statements(pm_parser_t *parser, pm_statements_node_t *statements) { pm_arguments_node_t *arguments = pm_arguments_node_create(parser); pm_arguments_node_arguments_append( + parser->arena, arguments, - (pm_node_t *) pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$;", 2)) + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$;", 2))) ); pm_global_variable_read_node_t *receiver = pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$_", 2)); - pm_call_node_t *call = pm_call_node_call_synthesized_create(parser, (pm_node_t *) receiver, "split", arguments); + pm_call_node_t *call = pm_call_node_call_synthesized_create(parser, UP(receiver), "split", arguments); pm_global_variable_write_node_t *write = pm_global_variable_write_node_synthesized_create( parser, pm_parser_constant_id_constant(parser, "$F", 2), - (pm_node_t *) call + UP(call) ); - pm_statements_node_body_prepend(statements, (pm_node_t *) write); + pm_statements_node_body_prepend(parser->arena, statements, UP(write)); } pm_arguments_node_t *arguments = pm_arguments_node_create(parser); pm_arguments_node_arguments_append( + parser->arena, arguments, - (pm_node_t *) pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$/", 2)) + UP(pm_global_variable_read_node_synthesized_create(parser, pm_parser_constant_id_constant(parser, "$/", 2))) ); if (PM_PARSER_COMMAND_LINE_OPTION_L(parser)) { pm_keyword_hash_node_t *keywords = pm_keyword_hash_node_create(parser); - pm_keyword_hash_node_elements_append(keywords, (pm_node_t *) pm_assoc_node_create( + pm_keyword_hash_node_elements_append(parser->arena, keywords, UP(pm_assoc_node_create( parser, - (pm_node_t *) pm_symbol_node_synthesized_create(parser, "chomp"), - &(pm_token_t) { .type = PM_TOKEN_NOT_PROVIDED, .start = parser->start, .end = parser->start }, - (pm_node_t *) pm_true_node_synthesized_create(parser) - )); + UP(pm_symbol_node_synthesized_create(parser, "chomp")), + NULL, + UP(pm_true_node_synthesized_create(parser)) + ))); - pm_arguments_node_arguments_append(arguments, (pm_node_t *) keywords); - pm_node_flag_set((pm_node_t *) arguments, PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS); + pm_arguments_node_arguments_append(parser->arena, arguments, UP(keywords)); + pm_node_flag_set(UP(arguments), PM_ARGUMENTS_NODE_FLAGS_CONTAINS_KEYWORDS); } pm_statements_node_t *wrapped_statements = pm_statements_node_create(parser); - pm_statements_node_body_append(parser, wrapped_statements, (pm_node_t *) pm_while_node_synthesized_create( + pm_statements_node_body_append(parser, wrapped_statements, UP(pm_while_node_synthesized_create( parser, - (pm_node_t *) pm_call_node_fcall_synthesized_create(parser, arguments, pm_parser_constant_id_constant(parser, "gets", 4)), + UP(pm_call_node_fcall_synthesized_create(parser, arguments, pm_parser_constant_id_constant(parser, "gets", 4))), statements - ), true); + )), true); statements = wrapped_statements; } @@ -22402,7 +22258,6 @@ parse_program(pm_parser_t *parser) { statements = wrap_statements(parser, statements); } else { flush_block_exits(parser, previous_block_exits); - pm_node_list_free(¤t_block_exits); } // If this is an empty file, then we're still going to parse all of the @@ -22410,10 +22265,10 @@ parse_program(pm_parser_t *parser) { // correct the location information. if (statements == NULL) { statements = pm_statements_node_create(parser); - pm_statements_node_location_set(statements, parser->start, parser->start); + statements->base.location = (pm_location_t) { 0 }; } - return (pm_node_t *) pm_program_node_create(parser, &locals, statements); + return UP(pm_program_node_create(parser, &locals, statements)); } /******************************************************************************/ @@ -22422,8 +22277,8 @@ parse_program(pm_parser_t *parser) { /** * A vendored version of strnstr that is used to find a substring within a - * string with a given length. This function is used to search for the Ruby - * engine name within a shebang when the -x option is passed to Ruby. + * string with a given length. This function is used to search for "ruby" + * within a shebang when the -x option is passed to Ruby. * * The only modification that we made here is that we don't do NULL byte checks * because we know the little parameter will not have a NULL byte and we allow @@ -22433,7 +22288,7 @@ static const char * pm_strnstr(const char *big, const char *little, size_t big_length) { size_t little_length = strlen(little); - for (const char *big_end = big + big_length; big < big_end; big++) { + for (const char *max = big + big_length - little_length; big <= max; big++) { if (*big == *little && memcmp(big, little, little_length) == 0) return big; } @@ -22451,7 +22306,7 @@ pm_strnstr(const char *big, const char *little, size_t big_length) { static void pm_parser_warn_shebang_carriage_return(pm_parser_t *parser, const uint8_t *start, size_t length) { if (length > 2 && start[length - 2] == '\r' && start[length - 1] == '\n') { - pm_parser_warn(parser, start, start + length, PM_WARN_SHEBANG_CARRIAGE_RETURN); + pm_parser_warn(parser, U32(start - parser->start), U32(length), PM_WARN_SHEBANG_CARRIAGE_RETURN); } } #endif @@ -22486,11 +22341,14 @@ pm_parser_init_shebang(pm_parser_t *parser, const pm_options_t *options, const c /** * Initialize a parser with the given start and end pointers. */ -PRISM_EXPORTED_FUNCTION void -pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options) { +void +pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options) { + assert(arena != NULL); assert(source != NULL); *parser = (pm_parser_t) { + .arena = arena, + .metadata_arena = { 0 }, .node_id = 0, .lex_state = PM_LEX_STATE_BEG, .enclosure_nesting = 0, @@ -22509,7 +22367,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .current = { .type = PM_TOKEN_EOF, .start = source, .end = source }, .next_start = NULL, .heredoc_end = NULL, - .data_loc = { .start = NULL, .end = NULL }, + .data_loc = { 0 }, .comment_list = { 0 }, .magic_comment_list = { 0 }, .warning_list = { 0 }, @@ -22519,11 +22377,11 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .encoding = PM_ENCODING_UTF_8_ENTRY, .encoding_changed_callback = NULL, .encoding_comment_start = source, - .lex_callback = NULL, + .lex_callback = { 0 }, .filepath = { 0 }, .constant_pool = { 0 }, - .newline_list = { 0 }, - .integer_base = 0, + .line_offsets = { 0 }, + .integer = { 0 }, .current_string = PM_STRING_EMPTY, .start_line = 1, .explicit_encoding = NULL, @@ -22532,6 +22390,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .partial_script = false, .command_start = true, .recovering = false, + .continuable = true, .encoding_locked = false, .encoding_changed = false, .pattern_matching_newlines = false, @@ -22539,32 +22398,30 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .current_block_exits = NULL, .semantic_token_seen = false, .frozen_string_literal = PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET, - .current_regular_expression_ascii_only = false, .warn_mismatched_indentation = true }; - // Initialize the constant pool. We're going to completely guess as to the - // number of constants that we'll need based on the size of the input. The - // ratio we chose here is actually less arbitrary than you might think. - // - // We took ~50K Ruby files and measured the size of the file versus the - // number of constants that were found in those files. Then we found the - // average and standard deviation of the ratios of constants/bytesize. Then - // we added 1.34 standard deviations to the average to get a ratio that - // would fit 75% of the files (for a two-tailed distribution). This works - // because there was about a 0.77 correlation and the distribution was - // roughly normal. - // - // This ratio will need to change if we add more constants to the constant - // pool for another node type. - uint32_t constant_size = ((uint32_t) size) / 95; - pm_constant_pool_init(&parser->constant_pool, constant_size < 4 ? 4 : constant_size); - - // Initialize the newline list. Similar to the constant pool, we're going to - // guess at the number of newlines that we'll need based on the size of the - // input. + /* Pre-size the arenas based on input size to reduce the number of block + * allocations (and the kernel page zeroing they trigger). The ratios were + * measured empirically: AST arena ~3.3x input, metadata arena ~1.1x input. + * The reserve call is a no-op when the capacity is at or below the default + * arena block size, so small inputs don't waste an extra allocation. */ + if (size <= SIZE_MAX / 4) pm_arena_reserve(arena, size * 4); + if (size <= SIZE_MAX / 5 * 4) pm_arena_reserve(&parser->metadata_arena, size + size / 4); + + /* Initialize the constant pool. Measured across 1532 Ruby stdlib files, the + * bytes/constant ratio has a median of ~56 and a 90th percentile of ~135. + * We use 120 as a balance between over-allocation waste and resize + * frequency. Resizes are cheap with arena allocation, so we lean toward + * under-estimating. */ + uint32_t constant_size = ((uint32_t) size) / 120; + pm_constant_pool_init(&parser->metadata_arena, &parser->constant_pool, constant_size < 4 ? 4 : constant_size); + + /* Initialize the line offset list. Similar to the constant pool, we are + * going to estimate the number of newlines that we will need based on the + * size of the input. */ size_t newline_size = size / 22; - pm_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size); + pm_line_offset_list_init(&parser->metadata_arena, &parser->line_offsets, newline_size < 4 ? 4 : newline_size); // If options were provided to this parse, establish them here. if (options != NULL) { @@ -22601,7 +22458,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm if (parser->parsing_eval) parser->warn_mismatched_indentation = false; for (size_t scope_index = 0; scope_index < options->scopes_count; scope_index++) { - const pm_options_scope_t *scope = pm_options_scope_get(options, scope_index); + const pm_options_scope_t *scope = pm_options_scope(options, scope_index); pm_parser_scope_push(parser, scope_index == 0); // Scopes given from the outside are not allowed to have numbered @@ -22609,20 +22466,24 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm parser->current_scope->parameters = ((pm_scope_parameters_t) scope->forwarding) | PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED; for (size_t local_index = 0; local_index < scope->locals_count; local_index++) { - const pm_string_t *local = pm_options_scope_local_get(scope, local_index); + const pm_string_t *local = pm_options_scope_local(scope, local_index); const uint8_t *source = pm_string_source(local); size_t length = pm_string_length(local); - void *allocated = xmalloc(length); - if (allocated == NULL) continue; - + uint8_t *allocated = (uint8_t *) pm_arena_alloc(&parser->metadata_arena, length, 1); memcpy(allocated, source, length); - pm_parser_local_add_owned(parser, (uint8_t *) allocated, length); + pm_parser_local_add_owned(parser, allocated, length); } } } + // Now that we have established the user-provided options, check if + // a version was given and parse as the latest version otherwise. + if (parser->version == PM_OPTIONS_VERSION_UNSET) { + parser->version = PM_OPTIONS_VERSION_LATEST; + } + pm_accepts_block_stack_push(parser, true); // Skip past the UTF-8 BOM if it exists. @@ -22656,8 +22517,8 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm // If the shebang does not include "ruby" and this is the main script being // parsed, then we will start searching the file for a shebang that does // contain "ruby" as if -x were passed on the command line. - const uint8_t *newline = next_newline(parser->start, parser->end - parser->start); - size_t length = (size_t) ((newline != NULL ? newline : parser->end) - parser->start); + const uint8_t *newline = next_newline(parser->current.end, parser->end - parser->current.end); + size_t length = (size_t) ((newline != NULL ? newline : parser->end) - parser->current.end); if (length > 2 && parser->current.end[0] == '#' && parser->current.end[1] == '!') { const char *engine; @@ -22676,7 +22537,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm } search_shebang = false; - } else if (options->main_script && !parser->parsing_eval) { + } else if (options != NULL && options->main_script && !parser->parsing_eval) { search_shebang = true; } } @@ -22697,7 +22558,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm const uint8_t *newline = next_newline(cursor, parser->end - cursor); while (newline != NULL) { - pm_newline_list_append(&parser->newline_list, newline); + pm_line_offset_list_append(&parser->metadata_arena, &parser->line_offsets, U32(newline - parser->start + 1)); cursor = newline + 1; newline = next_newline(cursor, parser->end - cursor); @@ -22726,8 +22587,8 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm parser->previous = (pm_token_t) { .type = PM_TOKEN_EOF, .start = cursor, .end = cursor }; parser->current = (pm_token_t) { .type = PM_TOKEN_EOF, .start = cursor, .end = cursor }; } else { - pm_parser_err(parser, parser->start, parser->start, PM_ERR_SCRIPT_NOT_FOUND); - pm_newline_list_clear(&parser->newline_list); + pm_parser_err(parser, 0, 0, PM_ERR_SCRIPT_NOT_FOUND); + pm_line_offset_list_clear(&parser->line_offsets); } } @@ -22738,56 +22599,28 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm } /** - * Register a callback that will be called whenever prism changes the encoding - * it is using to parse based on the magic comment. - */ -PRISM_EXPORTED_FUNCTION void -pm_parser_register_encoding_changed_callback(pm_parser_t *parser, pm_encoding_changed_callback_t callback) { - parser->encoding_changed_callback = callback; -} - -/** - * Free all of the memory associated with the comment list. - */ -static inline void -pm_comment_list_free(pm_list_t *list) { - pm_list_node_t *node, *next; - - for (node = list->head; node != NULL; node = next) { - next = node->next; - - pm_comment_t *comment = (pm_comment_t *) node; - xfree(comment); - } -} - -/** - * Free all of the memory associated with the magic comment list. + * Allocate and initialize a parser with the given start and end pointers. + * + * The resulting parser must eventually be freed with `pm_parser_free()`. The + * arena is caller-owned and must outlive the parser — `pm_parser_cleanup()` + * does not free the arena. */ -static inline void -pm_magic_comment_list_free(pm_list_t *list) { - pm_list_node_t *node, *next; - - for (node = list->head; node != NULL; node = next) { - next = node->next; +pm_parser_t * +pm_parser_new(pm_arena_t *arena, const uint8_t *source, size_t size, const pm_options_t *options) { + pm_parser_t *parser = (pm_parser_t *) xmalloc(sizeof(pm_parser_t)); + if (parser == NULL) abort(); - pm_magic_comment_t *magic_comment = (pm_magic_comment_t *) node; - xfree(magic_comment); - } + pm_parser_init(arena, parser, source, size, options); + return parser; } /** * Free any memory associated with the given parser. */ -PRISM_EXPORTED_FUNCTION void -pm_parser_free(pm_parser_t *parser) { - pm_string_free(&parser->filepath); - pm_diagnostic_list_free(&parser->error_list); - pm_diagnostic_list_free(&parser->warning_list); - pm_comment_list_free(&parser->comment_list); - pm_magic_comment_list_free(&parser->magic_comment_list); - pm_constant_pool_free(&parser->constant_pool); - pm_newline_list_free(&parser->newline_list); +void +pm_parser_cleanup(pm_parser_t *parser) { + pm_string_cleanup(&parser->filepath); + pm_arena_cleanup(&parser->metadata_arena); while (parser->current_scope != NULL) { // Normally, popping the scope doesn't free the locals since it is @@ -22803,145 +22636,224 @@ pm_parser_free(pm_parser_t *parser) { } /** - * Parse the Ruby source associated with the given parser and return the tree. + * Free both the memory held by the given parser and the parser itself. */ -PRISM_EXPORTED_FUNCTION pm_node_t * -pm_parse(pm_parser_t *parser) { - return parse_program(parser); +void +pm_parser_free(pm_parser_t *parser) { + pm_parser_cleanup(parser); + xfree_sized(parser, sizeof(pm_parser_t)); } /** - * Read into the stream until the gets callback returns false. If the last read - * line from the stream matches an __END__ marker, then halt and return false, - * otherwise return true. + * Returns true if the given diagnostic ID represents an error that cannot be + * fixed by appending more input. These are errors where the existing source + * contains definitively invalid syntax (as opposed to merely incomplete input). */ static bool -pm_parse_stream_read(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets) { -#define LINE_SIZE 4096 - char line[LINE_SIZE]; - - while (memset(line, '\n', LINE_SIZE), stream_fgets(line, LINE_SIZE, stream) != NULL) { - size_t length = LINE_SIZE; - while (length > 0 && line[length - 1] == '\n') length--; - - if (length == LINE_SIZE) { - // If we read a line that is the maximum size and it doesn't end - // with a newline, then we'll just append it to the buffer and - // continue reading. - length--; - pm_buffer_append_string(buffer, line, length); - continue; - } - - // Append the line to the buffer. - length--; - pm_buffer_append_string(buffer, line, length); - - // Check if the line matches the __END__ marker. If it does, then stop - // reading and return false. In most circumstances, this means we should - // stop reading from the stream so that the DATA constant can pick it - // up. - switch (length) { - case 7: - if (strncmp(line, "__END__", 7) == 0) return false; - break; - case 8: - if (strncmp(line, "__END__\n", 8) == 0) return false; - break; - case 9: - if (strncmp(line, "__END__\r\n", 9) == 0) return false; - break; - } +pm_parse_err_is_fatal(pm_diagnostic_id_t diag_id) { + switch (diag_id) { + case PM_ERR_ARRAY_EXPRESSION_AFTER_STAR: + case PM_ERR_BEGIN_UPCASE_BRACE: + case PM_ERR_CLASS_VARIABLE_BARE: + case PM_ERR_END_UPCASE_BRACE: + case PM_ERR_ESCAPE_INVALID_HEXADECIMAL: + case PM_ERR_ESCAPE_INVALID_UNICODE_LIST: + case PM_ERR_ESCAPE_INVALID_UNICODE_SHORT: + case PM_ERR_EXPRESSION_NOT_WRITABLE: + case PM_ERR_EXPRESSION_NOT_WRITABLE_SELF: + case PM_ERR_FLOAT_PARSE: + case PM_ERR_GLOBAL_VARIABLE_BARE: + case PM_ERR_HASH_KEY: + case PM_ERR_HEREDOC_IDENTIFIER: + case PM_ERR_INSTANCE_VARIABLE_BARE: + case PM_ERR_INVALID_BLOCK_EXIT: + case PM_ERR_INVALID_ENCODING_MAGIC_COMMENT: + case PM_ERR_INVALID_FLOAT_EXPONENT: + case PM_ERR_INVALID_NUMBER_BINARY: + case PM_ERR_INVALID_NUMBER_DECIMAL: + case PM_ERR_INVALID_NUMBER_HEXADECIMAL: + case PM_ERR_INVALID_NUMBER_OCTAL: + case PM_ERR_INVALID_NUMBER_UNDERSCORE_TRAILING: + case PM_ERR_NO_LOCAL_VARIABLE: + case PM_ERR_PARAMETER_ORDER: + case PM_ERR_STATEMENT_UNDEF: + case PM_ERR_VOID_EXPRESSION: + return true; + default: + return false; } - - return true; -#undef LINE_SIZE } /** - * Determine if there was an unterminated heredoc at the end of the input, which - * would mean the stream isn't finished and we should keep reading. + * Determine whether the source parsed by the given parser could become valid if + * more input were appended. This is used by tools like IRB to decide whether to + * prompt for continuation or to display an error. + * + * The parser starts with continuable=true. This function scans all errors to + * detect two categories of non-continuable errors: + * + * 1. Fatal errors: errors like invalid number literals or bare global variables + * that indicate definitively invalid syntax. These are only considered fatal + * if they occur before EOF (at EOF they could be from truncated input, e.g. + * `"\x` is an incomplete hex escape). * - * For the other lex modes we can check if the lex mode has been closed, but for - * heredocs when we hit EOF we close the lex mode and then go back to parse the - * rest of the line after the heredoc declaration so that we get more of the - * syntax tree. + * 2. Stray tokens: unexpected_token_ignore and unexpected_token_close_context + * errors indicate tokens that don't belong. A stray token is a cascade + * effect (and does not prevent continuability) if: + * + * a. A non-stray, non-fatal error appeared earlier in the error list at a + * strictly earlier source position (the stray was caused by a preceding + * parse failure, e.g. a truncated heredoc), OR + * b. The stray token is at EOF, starts after position 0 (there is valid + * code before it), and either is a single byte (likely a truncated + * token like `\`) or there are non-stray errors elsewhere. + * + * Closing delimiters (`)`, `]`, `}`) at EOF are always genuinely stray — + * they are complete tokens and cannot become part of a longer valid + * construct by appending more input. + * + * c. The stray token is `=` at the start of a line, which could be the + * beginning of `=begin` (an embedded document). The remaining bytes + * after `=` may parse as an identifier, so the error is not at EOF, + * but the construct is genuinely incomplete. */ -static bool -pm_parse_stream_unterminated_heredoc_p(pm_parser_t *parser) { - pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) parser->error_list.head; +static void +pm_parse_continuable(pm_parser_t *parser) { + // If there are no errors then there is nothing to continue. + if (parser->error_list.size == 0) { + parser->continuable = false; + return; + } - for (; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) { - if (diagnostic->diag_id == PM_ERR_HEREDOC_TERM) { - return true; + if (!parser->continuable) return; + + size_t source_length = (size_t) (parser->end - parser->start); + + // First pass: check if there are any non-stray, non-fatal errors. + bool has_non_stray_error = false; + for (pm_diagnostic_t *error = (pm_diagnostic_t *) parser->error_list.head; error != NULL; error = (pm_diagnostic_t *) error->node.next) { + if (error->diag_id != PM_ERR_UNEXPECTED_TOKEN_IGNORE && error->diag_id != PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT && !pm_parse_err_is_fatal(error->diag_id)) { + has_non_stray_error = true; + break; } } - return false; -} + // Second pass: check each error. We track the minimum source position + // among non-stray, non-fatal errors seen so far in list order, which + // lets us detect cascade stray tokens. + size_t non_stray_min_start = SIZE_MAX; -/** - * Parse a stream of Ruby source and return the tree. - * - * Prism is designed around having the entire source in memory at once, but you - * can stream stdin in to Ruby so we need to support a streaming API. - */ -PRISM_EXPORTED_FUNCTION pm_node_t * -pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const pm_options_t *options) { - pm_buffer_init(buffer); + for (pm_diagnostic_t *error = (pm_diagnostic_t *) parser->error_list.head; error != NULL; error = (pm_diagnostic_t *) error->node.next) { + size_t error_start = (size_t) error->location.start; + size_t error_end = error_start + (size_t) error->location.length; + bool at_eof = error_end >= source_length; - bool eof = pm_parse_stream_read(buffer, stream, stream_fgets); - pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); - pm_node_t *node = pm_parse(parser); + // Fatal errors are non-continuable unless they occur at EOF. + if (pm_parse_err_is_fatal(error->diag_id) && !at_eof) { + parser->continuable = false; + return; + } - while (!eof && parser->error_list.size > 0 && (parser->lex_modes.index > 0 || pm_parse_stream_unterminated_heredoc_p(parser))) { - pm_node_destroy(parser, node); - eof = pm_parse_stream_read(buffer, stream, stream_fgets); + // Track non-stray, non-fatal error positions in list order. + if (error->diag_id != PM_ERR_UNEXPECTED_TOKEN_IGNORE && + error->diag_id != PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT) { + if (error_start < non_stray_min_start) non_stray_min_start = error_start; + continue; + } - pm_parser_free(parser); - pm_parser_init(parser, (const uint8_t *) pm_buffer_value(buffer), pm_buffer_length(buffer), options); - node = pm_parse(parser); + // This is a stray token. Determine if it is a cascade effect + // of a preceding error or genuinely stray. + + // Rule (a): a non-stray error was seen earlier in the list at a + // strictly earlier position — this stray is a cascade effect. + if (non_stray_min_start < error_start) continue; + + // Rule (b): this stray is at EOF with valid code before it. + // Single-byte stray tokens at EOF (like `\` for line continuation) + // are likely truncated tokens. Multi-byte stray tokens (like the + // keyword `end`) need additional evidence that they are cascade + // effects (i.e. non-stray errors exist elsewhere). + if (at_eof && error_start > 0) { + // Exception: closing delimiters at EOF are genuinely stray. + if (error->location.length == 1) { + const uint8_t *byte = parser->start + error_start; + if (*byte == ')' || *byte == ']' || *byte == '}') { + parser->continuable = false; + return; + } + + // Single-byte non-delimiter stray at EOF: cascade. + continue; + } + + // Multi-byte stray at EOF: cascade only if there are + // non-stray errors (evidence of a preceding parse failure). + if (has_non_stray_error) continue; + } + + // Rule (c): a stray `=` at the start of a line could be the + // beginning of an embedded document (`=begin`). The remaining + // bytes after `=` parse as an identifier, so the error is not + // at EOF, but the construct is genuinely incomplete. + if (error->location.length == 1) { + const uint8_t *byte = parser->start + error_start; + if (*byte == '=' && (error_start == 0 || *(byte - 1) == '\n')) continue; + } + + // This stray token is genuinely non-continuable. + parser->continuable = false; + return; } +} +/** + * Parse the Ruby source associated with the given parser and return the tree. + */ +pm_node_t * +pm_parse(pm_parser_t *parser) { + pm_node_t *node = parse_program(parser); + pm_parse_continuable(parser); return node; } /** - * Parse the source and return true if it parses without errors or warnings. + * Parse a stream of Ruby source and return the tree. + * + * Prism is designed around having the entire source in memory at once, but you + * can stream stdin in to Ruby so we need to support a streaming API. */ -PRISM_EXPORTED_FUNCTION bool -pm_parse_success_p(const uint8_t *source, size_t size, const char *data) { - pm_options_t options = { 0 }; - pm_options_read(&options, data); +pm_node_t * +pm_parse_stream(pm_parser_t **parser, pm_arena_t *arena, pm_source_t *source, const pm_options_t *options) { + bool eof = pm_source_stream_read(source); - pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); + pm_parser_t *tmp = pm_parser_new(arena, pm_source_source(source), pm_source_length(source), options); + pm_node_t *node = pm_parse(tmp); - pm_node_t *node = pm_parse(&parser); - pm_node_destroy(&parser, node); + while (!eof && tmp->error_list.size > 0) { + eof = pm_source_stream_read(source); - bool result = parser.error_list.size == 0; - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_free(tmp); + pm_arena_cleanup(arena); - return result; + tmp = pm_parser_new(arena, pm_source_source(source), pm_source_length(source), options); + node = pm_parse(tmp); + } + + *parser = tmp; + return node; } #undef PM_CASE_KEYWORD #undef PM_CASE_OPERATOR #undef PM_CASE_WRITABLE #undef PM_STRING_EMPTY -#undef PM_LOCATION_NODE_BASE_VALUE -#undef PM_LOCATION_NODE_VALUE -#undef PM_LOCATION_NULL_VALUE -#undef PM_LOCATION_TOKEN_VALUE // We optionally support serializing to a binary string. For systems that don't // want or need this functionality, it can be turned off with the // PRISM_EXCLUDE_SERIALIZATION define. #ifndef PRISM_EXCLUDE_SERIALIZATION -static inline void +static PRISM_INLINE void pm_serialize_header(pm_buffer_t *buffer) { pm_buffer_append_string(buffer, "PRISM", 5); pm_buffer_append_byte(buffer, PRISM_VERSION_MAJOR); @@ -22953,7 +22865,7 @@ pm_serialize_header(pm_buffer_t *buffer) { /** * Serialize the AST represented by the given node to the given buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { pm_serialize_header(buffer); pm_serialize_content(parser, node, buffer); @@ -22964,13 +22876,14 @@ pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { * Parse and serialize the AST represented by the given source to the given * buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { pm_options_t options = { 0 }; pm_options_read(&options, data); + pm_arena_t arena = { 0 }; pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); + pm_parser_init(&arena, &parser, source, size, &options); pm_node_t *node = pm_parse(&parser); @@ -22978,216 +22891,53 @@ pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, cons pm_serialize_content(&parser, node, buffer); pm_buffer_append_byte(buffer, '\0'); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_cleanup(&parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); } /** * Parse and serialize the AST represented by the source that is read out of the * given stream into to the given buffer. */ -PRISM_EXPORTED_FUNCTION void -pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const char *data) { - pm_parser_t parser; +void +pm_serialize_parse_stream(pm_buffer_t *buffer, pm_source_t *source, const char *data) { + pm_arena_t arena = { 0 }; + pm_parser_t *parser; pm_options_t options = { 0 }; pm_options_read(&options, data); - pm_buffer_t parser_buffer; - pm_node_t *node = pm_parse_stream(&parser, &parser_buffer, stream, stream_fgets, &options); + pm_node_t *node = pm_parse_stream(&parser, &arena, source, &options); pm_serialize_header(buffer); - pm_serialize_content(&parser, node, buffer); + pm_serialize_content(parser, node, buffer); pm_buffer_append_byte(buffer, '\0'); - pm_node_destroy(&parser, node); - pm_buffer_free(&parser_buffer); - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_free(parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); } /** * Parse and serialize the comments in the given source to the given buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { pm_options_t options = { 0 }; pm_options_read(&options, data); + pm_arena_t arena = { 0 }; pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); + pm_parser_init(&arena, &parser, source, size, &options); - pm_node_t *node = pm_parse(&parser); + pm_parse(&parser); pm_serialize_header(buffer); pm_serialize_encoding(parser.encoding, buffer); pm_buffer_append_varsint(buffer, parser.start_line); - pm_serialize_comment_list(&parser, &parser.comment_list, buffer); + pm_serialize_comment_list(&parser.comment_list, buffer); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_cleanup(&parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); } #endif - -/******************************************************************************/ -/* Slice queries for the Ruby API */ -/******************************************************************************/ - -/** The category of slice returned from pm_slice_type. */ -typedef enum { - /** Returned when the given encoding name is invalid. */ - PM_SLICE_TYPE_ERROR = -1, - - /** Returned when no other types apply to the slice. */ - PM_SLICE_TYPE_NONE, - - /** Returned when the slice is a valid local variable name. */ - PM_SLICE_TYPE_LOCAL, - - /** Returned when the slice is a valid constant name. */ - PM_SLICE_TYPE_CONSTANT, - - /** Returned when the slice is a valid method name. */ - PM_SLICE_TYPE_METHOD_NAME -} pm_slice_type_t; - -/** - * Check that the slice is a valid local variable name or constant. - */ -pm_slice_type_t -pm_slice_type(const uint8_t *source, size_t length, const char *encoding_name) { - // first, get the right encoding object - const pm_encoding_t *encoding = pm_encoding_find((const uint8_t *) encoding_name, (const uint8_t *) (encoding_name + strlen(encoding_name))); - if (encoding == NULL) return PM_SLICE_TYPE_ERROR; - - // check that there is at least one character - if (length == 0) return PM_SLICE_TYPE_NONE; - - size_t width; - if ((width = encoding->alpha_char(source, (ptrdiff_t) length)) != 0) { - // valid because alphabetical - } else if (*source == '_') { - // valid because underscore - width = 1; - } else if ((*source >= 0x80) && ((width = encoding->char_width(source, (ptrdiff_t) length)) > 0)) { - // valid because multibyte - } else { - // invalid because no match - return PM_SLICE_TYPE_NONE; - } - - // determine the type of the slice based on the first character - const uint8_t *end = source + length; - pm_slice_type_t result = encoding->isupper_char(source, end - source) ? PM_SLICE_TYPE_CONSTANT : PM_SLICE_TYPE_LOCAL; - - // next, iterate through all of the bytes of the string to ensure that they - // are all valid identifier characters - source += width; - - while (source < end) { - if ((width = encoding->alnum_char(source, end - source)) != 0) { - // valid because alphanumeric - source += width; - } else if (*source == '_') { - // valid because underscore - source++; - } else if ((*source >= 0x80) && ((width = encoding->char_width(source, end - source)) > 0)) { - // valid because multibyte - source += width; - } else { - // invalid because no match - break; - } - } - - // accept a ! or ? at the end of the slice as a method name - if (*source == '!' || *source == '?' || *source == '=') { - source++; - result = PM_SLICE_TYPE_METHOD_NAME; - } - - // valid if we are at the end of the slice - return source == end ? result : PM_SLICE_TYPE_NONE; -} - -/** - * Check that the slice is a valid local variable name. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t -pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name) { - switch (pm_slice_type(source, length, encoding_name)) { - case PM_SLICE_TYPE_ERROR: - return PM_STRING_QUERY_ERROR; - case PM_SLICE_TYPE_NONE: - case PM_SLICE_TYPE_CONSTANT: - case PM_SLICE_TYPE_METHOD_NAME: - return PM_STRING_QUERY_FALSE; - case PM_SLICE_TYPE_LOCAL: - return PM_STRING_QUERY_TRUE; - } - - assert(false && "unreachable"); - return PM_STRING_QUERY_FALSE; -} - -/** - * Check that the slice is a valid constant name. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t -pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name) { - switch (pm_slice_type(source, length, encoding_name)) { - case PM_SLICE_TYPE_ERROR: - return PM_STRING_QUERY_ERROR; - case PM_SLICE_TYPE_NONE: - case PM_SLICE_TYPE_LOCAL: - case PM_SLICE_TYPE_METHOD_NAME: - return PM_STRING_QUERY_FALSE; - case PM_SLICE_TYPE_CONSTANT: - return PM_STRING_QUERY_TRUE; - } - - assert(false && "unreachable"); - return PM_STRING_QUERY_FALSE; -} - -/** - * Check that the slice is a valid method name. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t -pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name) { -#define B(p) ((p) ? PM_STRING_QUERY_TRUE : PM_STRING_QUERY_FALSE) -#define C1(c) (*source == c) -#define C2(s) (memcmp(source, s, 2) == 0) -#define C3(s) (memcmp(source, s, 3) == 0) - - switch (pm_slice_type(source, length, encoding_name)) { - case PM_SLICE_TYPE_ERROR: - return PM_STRING_QUERY_ERROR; - case PM_SLICE_TYPE_NONE: - break; - case PM_SLICE_TYPE_LOCAL: - // numbered parameters are not valid method names - return B((length != 2) || (source[0] != '_') || (source[1] == '0') || !pm_char_is_decimal_digit(source[1])); - case PM_SLICE_TYPE_CONSTANT: - // all constants are valid method names - case PM_SLICE_TYPE_METHOD_NAME: - // all method names are valid method names - return PM_STRING_QUERY_TRUE; - } - - switch (length) { - case 1: - return B(C1('&') || C1('`') || C1('!') || C1('^') || C1('>') || C1('<') || C1('-') || C1('%') || C1('|') || C1('+') || C1('/') || C1('*') || C1('~')); - case 2: - return B(C2("!=") || C2("!~") || C2("[]") || C2("==") || C2("=~") || C2(">=") || C2(">>") || C2("<=") || C2("<<") || C2("**")); - case 3: - return B(C3("===") || C3("<=>") || C3("[]=")); - default: - return PM_STRING_QUERY_FALSE; - } - -#undef B -#undef C1 -#undef C2 -#undef C3 -} diff --git a/prism/prism.h b/prism/prism.h index 317568aa0c..b342bb32c6 100644 --- a/prism/prism.h +++ b/prism/prism.h @@ -6,281 +6,25 @@ #ifndef PRISM_H #define PRISM_H -#include "prism/defines.h" -#include "prism/util/pm_buffer.h" -#include "prism/util/pm_char.h" -#include "prism/util/pm_integer.h" -#include "prism/util/pm_memchr.h" -#include "prism/util/pm_strncasecmp.h" -#include "prism/util/pm_strpbrk.h" +#ifdef __cplusplus +extern "C" { +#endif + +#include "prism/arena.h" #include "prism/ast.h" +#include "prism/buffer.h" #include "prism/diagnostic.h" +#include "prism/json.h" #include "prism/node.h" #include "prism/options.h" -#include "prism/pack.h" #include "prism/parser.h" #include "prism/prettyprint.h" -#include "prism/regexp.h" -#include "prism/static_literals.h" +#include "prism/serialize.h" +#include "prism/source.h" +#include "prism/stream.h" +#include "prism/string_query.h" #include "prism/version.h" -#include <assert.h> -#include <errno.h> -#include <locale.h> -#include <math.h> -#include <stdarg.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> - -#ifndef _WIN32 -#include <strings.h> -#endif - -/** - * The prism version and the serialization format. - * - * @returns The prism version as a constant string. - */ -PRISM_EXPORTED_FUNCTION const char * pm_version(void); - -/** - * Initialize a parser with the given start and end pointers. - * - * @param parser The parser to initialize. - * @param source The source to parse. - * @param size The size of the source. - * @param options The optional options to use when parsing. - */ -PRISM_EXPORTED_FUNCTION void pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options); - -/** - * Register a callback that will be called whenever prism changes the encoding - * it is using to parse based on the magic comment. - * - * @param parser The parser to register the callback with. - * @param callback The callback to register. - */ -PRISM_EXPORTED_FUNCTION void pm_parser_register_encoding_changed_callback(pm_parser_t *parser, pm_encoding_changed_callback_t callback); - -/** - * Free any memory associated with the given parser. - * - * @param parser The parser to free. - */ -PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser); - -/** - * Initiate the parser with the given parser. - * - * @param parser The parser to use. - * @return The AST representing the source. - */ -PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser); - -/** - * This function is used in pm_parse_stream to retrieve a line of input from a - * stream. It closely mirrors that of fgets so that fgets can be used as the - * default implementation. - */ -typedef char * (pm_parse_stream_fgets_t)(char *string, int size, void *stream); - -/** - * Parse a stream of Ruby source and return the tree. - * - * @param parser The parser to use. - * @param buffer The buffer to use. - * @param stream The stream to parse. - * @param stream_fgets The function to use to read from the stream. - * @param options The optional options to use when parsing. - * @return The AST representing the source. - */ -PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t *parser, pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const pm_options_t *options); - -// We optionally support serializing to a binary string. For systems that don't -// want or need this functionality, it can be turned off with the -// PRISM_EXCLUDE_SERIALIZATION define. -#ifndef PRISM_EXCLUDE_SERIALIZATION - -/** - * Parse and serialize the AST represented by the source that is read out of the - * given stream into to the given buffer. - * - * @param buffer The buffer to serialize to. - * @param stream The stream to parse. - * @param stream_fgets The function to use to read from the stream. - * @param data The optional data to pass to the parser. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize_parse_stream(pm_buffer_t *buffer, void *stream, pm_parse_stream_fgets_t *stream_fgets, const char *data); - -/** - * Serialize the given list of comments to the given buffer. - * - * @param parser The parser to serialize. - * @param list The list of comments to serialize. - * @param buffer The buffer to serialize to. - */ -void pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer); - -/** - * Serialize the name of the encoding to the buffer. - * - * @param encoding The encoding to serialize. - * @param buffer The buffer to serialize to. - */ -void pm_serialize_encoding(const pm_encoding_t *encoding, pm_buffer_t *buffer); - -/** - * Serialize the encoding, metadata, nodes, and constant pool. - * - * @param parser The parser to serialize. - * @param node The node to serialize. - * @param buffer The buffer to serialize to. - */ -void pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); - -/** - * Serialize the AST represented by the given node to the given buffer. - * - * @param parser The parser to serialize. - * @param node The node to serialize. - * @param buffer The buffer to serialize to. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer); - -/** - * Parse the given source to the AST and dump the AST to the given buffer. - * - * @param buffer The buffer to serialize to. - * @param source The source to parse. - * @param size The size of the source. - * @param data The optional data to pass to the parser. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); - -/** - * Parse and serialize the comments in the given source to the given buffer. - * - * @param buffer The buffer to serialize to. - * @param source The source to parse. - * @param size The size of the source. - * @param data The optional data to pass to the parser. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); - -/** - * Lex the given source and serialize to the given buffer. - * - * @param source The source to lex. - * @param size The size of the source. - * @param buffer The buffer to serialize to. - * @param data The optional data to pass to the lexer. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); - -/** - * Parse and serialize both the AST and the tokens represented by the given - * source to the given buffer. - * - * @param buffer The buffer to serialize to. - * @param source The source to parse. - * @param size The size of the source. - * @param data The optional data to pass to the parser. - */ -PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data); - -#endif - -/** - * Parse the source and return true if it parses without errors or warnings. - * - * @param source The source to parse. - * @param size The size of the source. - * @param data The optional data to pass to the parser. - * @return True if the source parses without errors or warnings. - */ -PRISM_EXPORTED_FUNCTION bool pm_parse_success_p(const uint8_t *source, size_t size, const char *data); - -/** - * Returns a string representation of the given token type. - * - * @param token_type The token type to convert to a string. - * @return A string representation of the given token type. - */ -PRISM_EXPORTED_FUNCTION const char * pm_token_type_name(pm_token_type_t token_type); - -/** - * Returns the human name of the given token type. - * - * @param token_type The token type to convert to a human name. - * @return The human name of the given token type. - */ -const char * pm_token_type_human(pm_token_type_t token_type); - -// We optionally support dumping to JSON. For systems that don't want or need -// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. -#ifndef PRISM_EXCLUDE_JSON - -/** - * Dump JSON to the given buffer. - * - * @param buffer The buffer to serialize to. - * @param parser The parser that parsed the node. - * @param node The node to serialize. - */ -PRISM_EXPORTED_FUNCTION void pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node); - -#endif - -/** - * Represents the results of a slice query. - */ -typedef enum { - /** Returned if the encoding given to a slice query was invalid. */ - PM_STRING_QUERY_ERROR = -1, - - /** Returned if the result of the slice query is false. */ - PM_STRING_QUERY_FALSE, - - /** Returned if the result of the slice query is true. */ - PM_STRING_QUERY_TRUE -} pm_string_query_t; - -/** - * Check that the slice is a valid local variable name. - * - * @param source The source to check. - * @param length The length of the source. - * @param encoding_name The name of the encoding of the source. - * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if - * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name); - -/** - * Check that the slice is a valid constant name. - * - * @param source The source to check. - * @param length The length of the source. - * @param encoding_name The name of the encoding of the source. - * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if - * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name); - -/** - * Check that the slice is a valid method name. - * - * @param source The source to check. - * @param length The length of the source. - * @param encoding_name The name of the encoding of the source. - * @return PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if - * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. - */ -PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name); - /** * @mainpage * @@ -289,7 +33,7 @@ PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint * dependencies. It is currently being integrated into * [CRuby](https://github.com/ruby/ruby), * [JRuby](https://github.com/jruby/jruby), - * [TruffleRuby](https://github.com/oracle/truffleruby), + * [TruffleRuby](https://github.com/truffleruby/truffleruby), * [Sorbet](https://github.com/sorbet/sorbet), and * [Syntax Tree](https://github.com/ruby-syntax-tree/syntax_tree). * @@ -303,32 +47,32 @@ PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint * * @section parsing Parsing * - * In order to parse Ruby code, the structures and functions that you're going - * to want to use and be aware of are: + * In order to parse Ruby code, the functions that you are going to want to use + * and be aware of are: * - * * `pm_parser_t` - the main parser structure - * * `pm_parser_init` - initialize a parser - * * `pm_parse` - parse and return the root node - * * `pm_node_destroy` - deallocate the root node returned by `pm_parse` - * * `pm_parser_free` - free the internal memory of the parser + * * `pm_arena_new()` - create a new arena to hold all AST-lifetime allocations + * * `pm_parser_new()` - allocate and initialize a new parser + * * `pm_parse()` - parse and return the root node + * * `pm_parser_free()` - free the parser and its internal memory + * * `pm_arena_free()` - free all AST-lifetime memory * * Putting all of this together would look something like: * * ```c * void parse(const uint8_t *source, size_t length) { - * pm_parser_t parser; - * pm_parser_init(&parser, source, length, NULL); + * pm_arena_t *arena = pm_arena_new(); + * pm_parser_t *parser = pm_parser_new(arena, source, length, NULL); * - * pm_node_t *root = pm_parse(&parser); + * pm_node_t *root = pm_parse(parser); * printf("PARSED!\n"); * - * pm_node_destroy(&parser, root); - * pm_parser_free(&parser); + * pm_parser_free(parser); + * pm_arena_free(arena); * } * ``` * - * All of the nodes "inherit" from `pm_node_t` by embedding those structures as - * their first member. This means you can downcast and upcast any node in the + * All of the nodes "inherit" from `pm_node_t` by embedding those structures + * as their first member. This means you can downcast and upcast any node in the * tree to a `pm_node_t`. * * @section serializing Serializing @@ -336,48 +80,51 @@ PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint * Prism provides the ability to serialize the AST and its related metadata into * a binary format. This format is designed to be portable to different * languages and runtimes so that you only need to make one FFI call in order to - * parse Ruby code. The structures and functions that you're going to want to - * use and be aware of are: + * parse Ruby code. The functions that you are going to want to use and be + * aware of are: * - * * `pm_buffer_t` - a small buffer object that will hold the serialized AST - * * `pm_buffer_free` - free the memory associated with the buffer - * * `pm_serialize` - serialize the AST into a buffer - * * `pm_serialize_parse` - parse and serialize the AST into a buffer + * * `pm_buffer_new()` - create a new buffer + * * `pm_buffer_free()` - free the buffer and its internal memory + * * `pm_serialize_parse()` - parse and serialize the AST into a buffer * * Putting all of this together would look something like: * * ```c * void serialize(const uint8_t *source, size_t length) { - * pm_buffer_t buffer = { 0 }; + * pm_buffer_t *buffer = pm_buffer_new(); * - * pm_serialize_parse(&buffer, source, length, NULL); + * pm_serialize_parse(buffer, source, length, NULL); * printf("SERIALIZED!\n"); * - * pm_buffer_free(&buffer); + * pm_buffer_free(buffer); * } * ``` * * @section inspecting Inspecting * * Prism provides the ability to inspect the AST by pretty-printing nodes. You - * can do this with the `pm_prettyprint` function, which you would use like: + * can do this with the `pm_prettyprint()` function, which you would use like: * * ```c * void prettyprint(const uint8_t *source, size_t length) { - * pm_parser_t parser; - * pm_parser_init(&parser, source, length, NULL); + * pm_arena_t *arena = pm_arena_new(); + * pm_parser_t *parser = pm_parser_new(arena, source, length, NULL); * - * pm_node_t *root = pm_parse(&parser); - * pm_buffer_t buffer = { 0 }; + * pm_node_t *root = pm_parse(parser); + * pm_buffer_t *buffer = pm_buffer_new(); * - * pm_prettyprint(&buffer, &parser, root); - * printf("%*.s\n", (int) buffer.length, buffer.value); + * pm_prettyprint(buffer, parser, root); + * printf("%*.s\n", (int) pm_buffer_length(buffer), pm_buffer_value(buffer)); * - * pm_buffer_free(&buffer); - * pm_node_destroy(&parser, root); - * pm_parser_free(&parser); + * pm_buffer_free(buffer); + * pm_parser_free(parser); + * pm_arena_free(arena); * } * ``` */ +#ifdef __cplusplus +} +#endif + #endif diff --git a/prism/regexp.c b/prism/regexp.c index dcc7476244..cc17aa4d09 100644 --- a/prism/regexp.c +++ b/prism/regexp.c @@ -1,5 +1,20 @@ -#include "prism/regexp.h" - +#include "prism/internal/regexp.h" + +#include "prism/compiler/inline.h" +#include "prism/compiler/fallthrough.h" +#include "prism/internal/buffer.h" +#include "prism/internal/char.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/memchr.h" +#include "prism/internal/parser.h" +#include "prism/internal/stringy.h" +#include "prism/internal/strncasecmp.h" + +#include <assert.h> +#include <string.h> + +/** The maximum depth of nested groups allowed in a regular expression. */ #define PM_REGEXP_PARSE_DEPTH_MAX 4096 /** @@ -18,6 +33,54 @@ typedef struct { /** A pointer to the end of the source that we are parsing. */ const uint8_t *end; + /** The encoding of the source. */ + const pm_encoding_t *encoding; + + /** The callback to call when a named capture group is found. */ + pm_regexp_name_callback_t name_callback; + + /** The data to pass to the name callback. */ + pm_regexp_name_data_t *name_data; + + /** The start of the regexp node (for error locations). */ + const uint8_t *node_start; + + /** The end of the regexp node (for error locations). */ + const uint8_t *node_end; + + /** + * The explicit encoding determined by escape sequences. NULL if no + * encoding-setting escape has been seen, UTF-8 for `\u` escapes, or the + * source encoding for `\x` escapes. + */ + const pm_encoding_t *explicit_encoding; + + /** + * Pointer to the first non-POSIX property name (for /n error messages). + * POSIX properties (Alnum, Alpha, etc.) work in all encodings. + * Script properties (Hiragana, Katakana, etc.) work in /e, /s, /u. + * Unicode-only properties (L, Ll, etc.) work only in /u. + */ + const uint8_t *property_name; + + /** Length of the first non-POSIX property name found. */ + size_t property_name_length; + + /** + * Pointer to the first Unicode-only property name (for /e, /s error + * messages). NULL if only POSIX or script properties have been seen. + */ + const uint8_t *unicode_property_name; + + /** Length of the first Unicode-only property name found. */ + size_t unicode_property_name_length; + + /** Buffer of hex escape byte values >= 0x80, separated by 0x00 sentinels. */ + pm_buffer_t hex_escape_buffer; + + /** Count of non-ASCII literal bytes (not from escapes). */ + uint32_t non_ascii_literal_count; + /** * Whether or not the regular expression currently being parsed is in * extended mode, wherein whitespace is ignored and comments are allowed. @@ -27,31 +90,77 @@ typedef struct { /** Whether the encoding has changed from the default. */ bool encoding_changed; - /** The encoding of the source. */ - const pm_encoding_t *encoding; + /** Whether the source content is shared (for named capture callback). */ + bool shared; - /** The callback to call when a named capture group is found. */ - pm_regexp_name_callback_t name_callback; + /** Whether a `\u{...}` escape with value >= 0x80 was seen. */ + bool has_unicode_escape; - /** The data to pass to the name callback. */ - void *name_data; + /** Whether a `\xNN` escape (or `\M-x`, etc.) with value >= 0x80 was seen. */ + bool has_hex_escape; + + /** + * Tracks whether the last encoding-setting escape was `\u` (true) or `\x` + * (false). This matters for error messages when both types are mixed. + */ + bool last_escape_was_unicode; + + /** Whether any `\p{...}` or `\P{...}` property escape was found. */ + bool has_property_escape; + + /** Whether a Unicode-only property escape was found (not POSIX or script). */ + bool has_unicode_property_escape; - /** The callback to call when a parse error is found. */ - pm_regexp_error_callback_t error_callback; + /** Whether a `\u` escape with invalid range (surrogate or > 0x10FFFF) was seen. */ + bool invalid_unicode_range; - /** The data to pass to the error callback. */ - void *error_data; + /** Whether we are accumulating consecutive hex escape bytes. */ + bool hex_group_active; + + /** Whether an invalid multibyte character was found during parsing. */ + bool has_invalid_multibyte; } pm_regexp_parser_t; /** - * Append an error to the parser. + * Append a syntax error to the parser's error list. If the source is shared + * (points into the original source), we can point to the exact error location. + * Otherwise, we point to the whole regexp node. */ -static inline void +static PRISM_INLINE void pm_regexp_parse_error(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, const char *message) { - parser->error_callback(start, end, message, parser->error_data); + pm_parser_t *pm = parser->parser; + uint32_t loc_start, loc_length; + + if (parser->shared) { + loc_start = (uint32_t) (start - pm->start); + loc_length = (uint32_t) (end - start); + } else { + loc_start = (uint32_t) (parser->node_start - pm->start); + loc_length = (uint32_t) (parser->node_end - parser->node_start); + } + + pm_diagnostic_list_append_format(&pm->metadata_arena, &pm->error_list, loc_start, loc_length, PM_ERR_REGEXP_PARSE_ERROR, message); } /** + * Append a formatted diagnostic error with proper shared/non-shared location + * handling. This is a macro because we need variadic args for the format string. + */ +#define pm_regexp_parse_error_format(parser_, err_start_, err_end_, diag_id, ...) \ + do { \ + pm_parser_t *pm__ = (parser_)->parser; \ + uint32_t loc_start__, loc_length__; \ + if ((parser_)->shared) { \ + loc_start__ = (uint32_t) ((err_start_) - pm__->start); \ + loc_length__ = (uint32_t) ((err_end_) - (err_start_)); \ + } else { \ + loc_start__ = (uint32_t) ((parser_)->node_start - pm__->start); \ + loc_length__ = (uint32_t) ((parser_)->node_end - (parser_)->node_start); \ + } \ + pm_diagnostic_list_append_format(&pm__->metadata_arena, &pm__->error_list, loc_start__, loc_length__, diag_id, __VA_ARGS__); \ + } while (0) + +/** * This appends a new string to the list of named captures. This function * assumes the caller has already checked the validity of the name callback. */ @@ -59,14 +168,14 @@ static void pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) { pm_string_t string; pm_string_shared_init(&string, start, end); - parser->name_callback(&string, parser->name_data); - pm_string_free(&string); + parser->name_callback(parser->parser, &string, parser->shared, parser->name_data); + pm_string_cleanup(&string); } /** * Returns true if the next character is the end of the source. */ -static inline bool +static PRISM_INLINE bool pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { return parser->cursor >= parser->end; } @@ -74,7 +183,7 @@ pm_regexp_char_is_eof(pm_regexp_parser_t *parser) { /** * Optionally accept a char and consume it if it exists. */ -static inline bool +static PRISM_INLINE bool pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { parser->cursor++; @@ -86,7 +195,7 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) { /** * Expect a character to be present and consume it. */ -static inline bool +static PRISM_INLINE bool pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) { if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) { parser->cursor++; @@ -114,6 +223,47 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { } /** + * Mark a group boundary in the hex escape byte buffer. When consecutive hex + * escape bytes >= 0x80 are followed by a non-hex-escape, this appends a 0x00 + * sentinel to separate the groups for later multibyte validation. + */ +static PRISM_INLINE void +pm_regexp_hex_group_boundary(pm_regexp_parser_t *parser) { + if (parser->hex_group_active) { + pm_buffer_append_byte(&parser->hex_escape_buffer, 0x00); + parser->hex_group_active = false; + } +} + +/** + * Track a hex escape byte value >= 0x80 for multibyte validation. + */ +static PRISM_INLINE void +pm_regexp_track_hex_escape(pm_regexp_parser_t *parser, uint8_t byte) { + if (byte >= 0x80) { + pm_buffer_append_byte(&parser->hex_escape_buffer, byte); + parser->hex_group_active = true; + parser->has_hex_escape = true; + + parser->explicit_encoding = parser->encoding; + parser->last_escape_was_unicode = false; + } else { + pm_regexp_hex_group_boundary(parser); + } +} + +/** + * Parse a hex digit character and return its value, or -1 if not a hex digit. + */ +static PRISM_INLINE int +pm_regexp_hex_digit_value(uint8_t byte) { + if (byte >= '0' && byte <= '9') return byte - '0'; + if (byte >= 'a' && byte <= 'f') return byte - 'a' + 10; + if (byte >= 'A' && byte <= 'F') return byte - 'A' + 10; + return -1; +} + +/** * Range quantifiers are a special class of quantifiers that look like * * * {digit} @@ -121,13 +271,12 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) { * * {digit,digit} * * {,digit} * - * Unfortunately, if there are any spaces in between, then this just becomes a - * regular character match expression and we have to backtrack. So when this - * function first starts running, we'll create a "save" point and then attempt - * to parse the quantifier. If it fails, we'll restore the save point and - * return. + * If there are any spaces in between, then this just becomes a regular + * character match expression and we have to backtrack. So when this function + * first starts running, we'll create a "save" point and then attempt to parse + * the quantifier. If it fails, we'll restore the save point and return. * - * The properly track everything, we're going to build a little state machine. + * To properly track everything, we're going to build a little state machine. * It looks something like the following: * * +-------+ +---------+ ------------+ @@ -275,11 +424,393 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) { ); } +/** + * Property escape classification. Onigmo supports three tiers of property + * names depending on the encoding: + * + * - POSIX properties (Alnum, Alpha, ASCII, Blank, Cntrl, Digit, Graph, Lower, + * Print, Punct, Space, Upper, XDigit, Word): valid in all encodings. + * - Script properties (Hiragana, Katakana, Han, Latin, Greek, Cyrillic): valid + * in EUC-JP (/e), Windows-31J (/s), and UTF-8 (/u), but not ASCII-8BIT (/n). + * - Unicode-only properties (general categories like L, Ll, Lu, etc., plus + * Any, Assigned): valid only in UTF-8 (/u). + */ +typedef enum { + PM_REGEXP_PROPERTY_POSIX, + PM_REGEXP_PROPERTY_SCRIPT, + PM_REGEXP_PROPERTY_UNICODE +} pm_regexp_property_type_t; + +/** + * Classify a property name. The name may start with '^' for negation, which + * is skipped before matching. + */ +static pm_regexp_property_type_t +pm_regexp_classify_property(const uint8_t *name, size_t length) { + // Skip leading '^' for negated properties like \p{^Hiragana}. + if (length > 0 && name[0] == '^') { + name++; + length--; + } + +#define PM_REGEXP_CASECMP(str_) (pm_strncasecmp(name, (const uint8_t *) (str_), length) == 0) + + switch (length) { + case 3: + if (PM_REGEXP_CASECMP("Han")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 4: + if (PM_REGEXP_CASECMP("Word")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 5: + /* Most properties are length 5, so dispatch on first character. */ + switch (name[0] | 0x20) { + case 'a': + if (PM_REGEXP_CASECMP("Alnum")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Alpha")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("ASCII")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'b': + if (PM_REGEXP_CASECMP("Blank")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'c': + if (PM_REGEXP_CASECMP("Cntrl")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'd': + if (PM_REGEXP_CASECMP("Digit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'g': + if (PM_REGEXP_CASECMP("Graph")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Greek")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'l': + if (PM_REGEXP_CASECMP("Lower")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Latin")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + case 'p': + if (PM_REGEXP_CASECMP("Print")) return PM_REGEXP_PROPERTY_POSIX; + if (PM_REGEXP_CASECMP("Punct")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 's': + if (PM_REGEXP_CASECMP("Space")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 'u': + if (PM_REGEXP_CASECMP("Upper")) return PM_REGEXP_PROPERTY_POSIX; + break; + } + break; + case 6: + if (PM_REGEXP_CASECMP("XDigit")) return PM_REGEXP_PROPERTY_POSIX; + break; + case 8: + if (PM_REGEXP_CASECMP("Hiragana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Katakana")) return PM_REGEXP_PROPERTY_SCRIPT; + if (PM_REGEXP_CASECMP("Cyrillic")) return PM_REGEXP_PROPERTY_SCRIPT; + break; + } + +#undef PM_REGEXP_CASECMP + + // Everything else is Unicode-only (general categories, other scripts, etc.). + return PM_REGEXP_PROPERTY_UNICODE; +} + +/** + * Check for and skip a `\p{...}` or `\P{...}` Unicode property escape. The + * cursor should be pointing at 'p' or 'P' when this is called. If a property + * escape is found, record it on the regexp parser and advance past the closing + * '}'. + * + * Properties are classified into three tiers (POSIX, script, Unicode-only) to + * determine which encoding modifiers they are valid with. + */ +static bool +pm_regexp_parse_property_escape(pm_regexp_parser_t *parser) { + assert(*parser->cursor == 'p' || *parser->cursor == 'P'); + + if (parser->cursor + 1 < parser->end && parser->cursor[1] == '{') { + const uint8_t *name_start = parser->cursor + 2; + const uint8_t *search = name_start; + + while (search < parser->end && *search != '}') search++; + + if (search < parser->end) { + size_t name_length = (size_t) (search - name_start); + parser->has_property_escape = true; + + pm_regexp_property_type_t type = pm_regexp_classify_property(name_start, name_length); + + // Track the first non-POSIX property name (for /n error messages). + if (type >= PM_REGEXP_PROPERTY_SCRIPT && parser->property_name == NULL) { + parser->property_name = name_start; + parser->property_name_length = name_length; + } + + // Track the first Unicode-only property name (for /e, /s error messages). + if (type == PM_REGEXP_PROPERTY_UNICODE) { + parser->has_unicode_property_escape = true; + if (parser->unicode_property_name == NULL) { + parser->unicode_property_name = name_start; + parser->unicode_property_name_length = name_length; + } + } + + parser->cursor = search + 1; // skip past '}' + return true; + } + } + + // Not a property escape, just skip the single character after '\'. + parser->cursor++; + return false; +} + +/** + * Validate and skip a \u escape sequence in a regular expression. The cursor + * should be pointing at the character after 'u' when this is called. This + * handles both the \u{NNNN MMMM} and \uNNNN forms. Also tracks encoding + * state for validation. + */ +static void +pm_regexp_parse_unicode_escape(pm_regexp_parser_t *parser) { + const uint8_t *escape_start = parser->cursor - 2; // points to '\' + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + return; + } + + if (*parser->cursor == '{') { + parser->cursor++; // skip '{' + + // Skip leading whitespace. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + + bool has_codepoint = false; + + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + // Parse the hex digits to compute the codepoint value. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (!pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count == 0) { + // Skip to '}' or end of regexp to find the full extent. + while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '}') { + parser->cursor++; + } + + const uint8_t *escape_end = parser->cursor; + if (!pm_regexp_char_is_eof(parser)) { + escape_end++; + parser->cursor++; // skip '}' + } + + pm_regexp_parse_error_format(parser, escape_start, escape_end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (escape_end - escape_start), (const char *) escape_start); + return; + } + + if (hex_count > 6) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode range"); + } + + // Track encoding state for this codepoint. + if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range (surrogates or > 0x10FFFF). + if (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF)) { + parser->invalid_unicode_range = true; + } + + has_codepoint = true; + + // Skip whitespace between codepoints. + while (!pm_regexp_char_is_eof(parser) && pm_char_is_whitespace(*parser->cursor)) { + parser->cursor++; + } + } + + if (pm_regexp_char_is_eof(parser)) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "unterminated Unicode escape"); + } else { + if (!has_codepoint) { + pm_regexp_parse_error_format(parser, escape_start, parser->cursor + 1, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->cursor + 1 - escape_start), (const char *) escape_start); + } + parser->cursor++; // skip '}' + } + } else { + // \uNNNN form — need exactly 4 hex digits. + uint32_t value = 0; + size_t hex_count = 0; + + int digit; + while (hex_count < 4 && !pm_regexp_char_is_eof(parser) && (digit = pm_regexp_hex_digit_value(*parser->cursor)) >= 0) { + value = (value << 4) | (uint32_t) digit; + hex_count++; + parser->cursor++; + } + + if (hex_count < 4) { + pm_regexp_parse_error(parser, escape_start, parser->cursor, "invalid Unicode escape"); + } else if (value >= 0x80) { + parser->has_unicode_escape = true; + parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY; + parser->last_escape_was_unicode = true; + pm_regexp_hex_group_boundary(parser); + } + + // Check for invalid Unicode range. + if (hex_count == 4 && (value > 0x10FFFF || (value >= 0xD800 && value <= 0xDFFF))) { + parser->invalid_unicode_range = true; + } + } +} + // Forward declaration because character sets can be nested. static bool pm_regexp_parse_lbracket(pm_regexp_parser_t *parser, uint16_t depth); /** + * Parse a \x escape and return the byte value. The cursor should be pointing + * at the character after 'x'. Returns -1 if no hex digits follow. + */ +static int +pm_regexp_parse_hex_escape(pm_regexp_parser_t *parser) { + int value = -1; + + if (!pm_regexp_char_is_eof(parser)) { + int digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = digit; + parser->cursor++; + + if (!pm_regexp_char_is_eof(parser)) { + digit = pm_regexp_hex_digit_value(*parser->cursor); + if (digit >= 0) { + value = (value << 4) | digit; + parser->cursor++; + } + } + } + } + + if (value >= 0) { + pm_regexp_track_hex_escape(parser, (uint8_t) value); + } + + return value; +} + +/** + * Parse a backslash escape sequence in a regexp, handling \u (unicode), + * \p/\P (property), \x (hex), and other single-character escapes. Also + * tracks encoding state for \M-x and \C-\M-x escapes. + */ +static void +pm_regexp_parse_backslash_escape(pm_regexp_parser_t *parser) { + if (pm_regexp_char_is_eof(parser)) return; + + switch (*parser->cursor) { + case 'u': + parser->cursor++; // skip 'u' + pm_regexp_parse_unicode_escape(parser); + break; + case 'p': + case 'P': + pm_regexp_parse_property_escape(parser); + break; + case 'x': + parser->cursor++; // skip 'x' + pm_regexp_parse_hex_escape(parser); + break; + case 'M': + // \M-x produces (x | 0x80), always >= 0x80 + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'M-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + // \M-\C-x or \M-\cx — the resulting byte is always >= 0x80 + // We just need to track it as a hex escape >= 0x80. + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + // \M-x always produces a byte >= 0x80 + pm_regexp_track_hex_escape(parser, 0x80); + } + } else { + parser->cursor++; + } + break; + case 'C': + // \C-x produces (x & 0x1F) + if (parser->cursor + 2 < parser->end && parser->cursor[1] == '-') { + parser->cursor += 2; // skip 'C-' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + } else { + parser->cursor++; + } + break; + case 'c': + // \cx produces (x & 0x1F) + parser->cursor++; // skip 'c' + if (!pm_regexp_char_is_eof(parser)) { + if (*parser->cursor == '\\') { + parser->cursor++; + pm_regexp_parse_backslash_escape(parser); + } else { + parser->cursor++; + } + } + break; + default: + pm_regexp_hex_group_boundary(parser); + parser->cursor++; + break; + } +} + +/** + * Check if a byte at the current position is a non-ASCII byte in a multibyte + * encoding that produces an invalid character. If so, emit an error at the + * byte location immediately. + */ +static void +pm_regexp_parse_invalid_multibyte(pm_regexp_parser_t *parser, const uint8_t *cursor) { + uint8_t byte = *cursor; + if (byte >= 0x80 && parser->encoding_changed && parser->encoding->multibyte) { + size_t width = parser->encoding->char_width(cursor, (ptrdiff_t) (parser->end - cursor)); + if (width > 1) { + parser->cursor += width - 1; + } else if (width == 0) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, cursor, cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } +} + +/** * match-char-set : '[' '^'? (match-range | match-char)* ']' * ; */ @@ -293,12 +824,16 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser, uint16_t depth) { pm_regexp_parse_lbracket(parser, (uint16_t) (depth + 1)); break; case '\\': - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); break; default: - // do nothing, we've already advanced the cursor + // We've already advanced the cursor by one byte. If the byte + // was >= 0x80 in a multibyte encoding, we may need to consume + // additional continuation bytes and validate the character. + if (*(parser->cursor - 1) >= 0x80) { + parser->non_ascii_literal_count++; + } + pm_regexp_parse_invalid_multibyte(parser, parser->cursor - 1); break; } } @@ -354,8 +889,13 @@ typedef enum { // These are the options that are configurable on the regular expression (or // from within a group). +/** The minimum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a' + +/** The maximum character value for a regexp option slot. */ #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x' + +/** The number of regexp option slots. */ #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1) /** @@ -498,7 +1038,15 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser, uint16_t depth) { } size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); - if (width == 0) return false; + if (width == 0) { + if (*parser->cursor >= 0x80) { + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + parser->cursor++; + continue; + } + return false; + } escaped = (width == 1) && (*parser->cursor == '\\'); parser->cursor += width; @@ -686,9 +1234,7 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { return pm_regexp_parse_quantifier(parser); case '\\': parser->cursor++; - if (!pm_regexp_char_is_eof(parser)) { - parser->cursor++; - } + pm_regexp_parse_backslash_escape(parser); return pm_regexp_parse_quantifier(parser); case '(': parser->cursor++; @@ -720,9 +1266,30 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser, uint16_t depth) { width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor)); } - if (width == 0) return false; // TODO: add appropriate error - parser->cursor += width; + if (width == 0) { + if (*parser->cursor >= 0x80 && parser->encoding_changed) { + if (parser->encoding->multibyte) { + // Invalid multibyte character in a multibyte encoding. + // Emit the error at the byte location immediately. + parser->has_invalid_multibyte = true; + pm_regexp_parse_error_format(parser, parser->cursor, parser->cursor + 1, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } else { + // Non-ASCII byte in a single-byte encoding (e.g., + // US-ASCII). Count it for later error reporting. + parser->non_ascii_literal_count++; + } + parser->cursor++; + return pm_regexp_parse_quantifier(parser); + } + return false; + } + + // Count non-ASCII literal bytes. + for (size_t i = 0; i < width; i++) { + if (parser->cursor[i] >= 0x80) parser->non_ascii_literal_count++; + } + parser->cursor += width; return pm_regexp_parse_quantifier(parser); } } @@ -768,13 +1335,354 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) { return pm_regexp_char_is_eof(parser); } +// --------------------------------------------------------------------------- +// Encoding validation +// --------------------------------------------------------------------------- + /** - * Parse a regular expression and extract the names of all of the named capture - * groups. + * Validate that groups of hex escape bytes in the buffer form valid multibyte + * characters in the given encoding. Groups are separated by 0x00 sentinels. + */ +static bool +pm_regexp_validate_hex_escapes(const pm_encoding_t *encoding, const pm_buffer_t *buffer) { + const uint8_t *data = (const uint8_t *) pm_buffer_value(buffer); + size_t len = pm_buffer_length(buffer); + size_t i = 0; + + while (i < len) { + size_t group_start = i; + while (i < len && data[i] != 0x00) i++; + + for (size_t j = group_start; j < i; ) { + size_t width = encoding->char_width(data + j, (ptrdiff_t) (i - j)); + if (width == 0) return false; + j += width; + } + + if (i < len) i++; // skip sentinel + } + + return true; +} + +/** + * Format regexp source content for use in error messages, hex-escaping + * non-ASCII bytes. + */ +static void +pm_regexp_format_for_error(pm_buffer_t *buffer, const pm_encoding_t *encoding, const uint8_t *source, size_t length) { + size_t index = 0; + + if (encoding == PM_ENCODING_UTF_8_ENTRY) { + pm_buffer_append_string(buffer, (const char *) source, length); + return; + } + + while (index < length) { + if (source[index] < 0x80) { + pm_buffer_append_byte(buffer, source[index]); + index++; + } else if (encoding->multibyte) { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (length - index)); + + if (width > 1) { + pm_buffer_append_string(buffer, "\\x{", 3); + for (size_t i = 0; i < width; i++) { + pm_buffer_append_format(buffer, "%02X", source[index + i]); + } + pm_buffer_append_byte(buffer, '}'); + index += width; + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } else { + pm_buffer_append_format(buffer, "\\x%02X", source[index]); + index++; + } + } +} + +/** + * Emit an encoding validation error on the regexp node. + */ +#define PM_REGEXP_ENCODING_ERROR(parser, diag_id, ...) \ + pm_diagnostic_list_append_format( \ + &(parser)->parser->metadata_arena, \ + &(parser)->parser->error_list, \ + (uint32_t) ((parser)->node_start - (parser)->parser->start), \ + (uint32_t) ((parser)->node_end - (parser)->node_start), \ + diag_id, __VA_ARGS__) + +/** + * Validate encoding for a regexp with an encoding modifier (/e, /s, /u, /n). + * + * The decision tree is: + * + * 1. No escape-set encoding (explicit_encoding == NULL): + * a. ASCII-only content: validate property escapes, return forced US-ASCII + * for /n or the modifier flags for others. + * b. US-ASCII source with non-ASCII literals: emit per-byte errors. + * c. Source encoding differs from modifier encoding: emit mismatch error. + * + * 2. Mixed \u and \x escapes: emit the appropriate conflict error depending + * on the modifier and which escape type was last. + * + * 3. \u escape with non-/u modifier: incompatible encoding error. + * + * 4. Validate that hex escape byte sequences form valid multibyte characters + * in the modifier's encoding. + */ +static pm_node_flags_t +pm_regexp_validate_encoding_modifier(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding, const char *source_start, int source_length) { + + if (parser->explicit_encoding == NULL) { + if (ascii_only) { + // Check property escapes against the modifier's encoding tier. + // /n (ASCII-8BIT): only POSIX properties are valid. + // /e, /s: POSIX and script properties are valid. + // /u: all properties are valid. + if (modifier == 'n' && parser->property_name != NULL) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->property_name_length, (const char *) parser->property_name, + source_length, source_start); + } else if (modifier != 'u' && parser->has_unicode_property_escape) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_CHAR_PROPERTY, + (int) parser->unicode_property_name_length, (const char *) parser->unicode_property_name, + source_length, source_start); + } + return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; + } + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } else if (parser->encoding != modifier_encoding) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); + + if (modifier == 'n' && !ascii_only) { + pm_buffer_t formatted = { 0 }; + pm_regexp_format_for_error(&formatted, parser->encoding, (const uint8_t *) source_start, (size_t) source_length); + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_NON_ESCAPED_MBC, (int) formatted.length, (const char *) formatted.value); + pm_buffer_cleanup(&formatted); + } + } + + return flags; + } + + // Mixed unicode + hex escapes. + if (parser->has_unicode_escape && parser->has_hex_escape) { + if (modifier == 'n') { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + } else { + if (!pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + + return flags; + } + + if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } else if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, source_length, source_start); + } + } + + if (modifier != 'n' && !pm_regexp_validate_hex_escapes(modifier_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return flags; +} + +/** + * Validate encoding for a regexp without a modifier and compute the encoding + * flags to set on the node. + * + * The decision tree is: + * + * 1. If a modifier (/n, /u, /e, /s) is present, delegate to + * pm_regexp_validate_encoding_modifier. + * 2. Invalid multibyte chars or unicode ranges: suppress further checks (errors + * were already emitted during parsing). + * 3. US-ASCII source with non-ASCII literals: emit per-byte errors. + * 4. ASCII-only content: return forced US-ASCII (or forced UTF-8 if \p{...}). + * 5. Escape-set encoding present: validate hex escapes against the target + * encoding, handle mixed \u + \x conflicts, and return the appropriate + * forced encoding flag. + */ +static pm_node_flags_t +pm_regexp_validate_encoding(pm_regexp_parser_t *parser, bool ascii_only, pm_node_flags_t flags, const char *source_start, int source_length) { + + // Invalid multibyte characters suppress further validation. + // Errors were already emitted at the byte locations during parsing. + if (parser->has_invalid_multibyte) { + return flags; + } + + if (parser->invalid_unicode_range) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, source_length, source_start); + return flags; + } + + // Check modifier flags first. + if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY, source_start, source_length); + } + if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { + return pm_regexp_validate_encoding_modifier(parser, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY, source_start, source_length); + } + + // No modifier — check for non-ASCII literals in US-ASCII encoding. + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) { + for (uint32_t i = 0; i < parser->non_ascii_literal_count; i++) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } + + // ASCII-only regexps get downgraded to US-ASCII, unless property escapes + // force UTF-8. + if (ascii_only) { + if (parser->has_property_escape) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; + } + + // Check explicit encoding from escape sequences. + if (parser->explicit_encoding != NULL) { + // Mixed unicode + hex escapes without modifier. + if (parser->has_unicode_escape && parser->has_hex_escape && parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + if (parser->encoding != PM_ENCODING_US_ASCII_ENTRY && + parser->encoding != PM_ENCODING_ASCII_8BIT_ENTRY && + !pm_regexp_validate_hex_escapes(parser->encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } else if (parser->last_escape_was_unicode) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP, source_length, source_start); + } else { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8, source_length, source_start); + } + + return 0; + } + + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; + } else { + if (!pm_regexp_validate_hex_escapes(parser->explicit_encoding, &parser->hex_escape_buffer)) { + PM_REGEXP_ENCODING_ERROR(parser, PM_ERR_INVALID_MULTIBYTE_ESCAPE, source_length, source_start); + } + } + } + + return 0; +} + +/** + * Parse a regular expression, validate its encoding, and optionally extract + * named capture groups. Encoding validation walks the raw source (content_loc) + * to distinguish escape-produced bytes from literal bytes. Named capture + * extraction walks the unescaped content since escape sequences in group names + * (e.g., line continuations) have already been processed by the lexer. + */ +pm_node_flags_t +pm_regexp_parse(pm_parser_t *parser, pm_regular_expression_node_t *node, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + const uint8_t *source = parser->start + node->content_loc.start; + size_t size = node->content_loc.length; + bool extended_mode = PM_NODE_FLAG_P(node, PM_REGULAR_EXPRESSION_FLAGS_EXTENDED); + pm_node_flags_t flags = PM_NODE_FLAGS(node); + + const uint8_t *node_start = parser->start + node->base.location.start; + const uint8_t *node_end = parser->start + node->base.location.start + node->base.location.length; + + // First pass: walk raw source for encoding validation (no name extraction). + pm_regexp_parser_t regexp_parser = { + .parser = parser, + .start = source, + .cursor = source, + .end = source + size, + .extended_mode = extended_mode, + .encoding_changed = parser->encoding_changed, + .encoding = parser->encoding, + .name_callback = NULL, + .name_data = NULL, + .shared = true, + .node_start = node_start, + .node_end = node_end, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + + // Compute ascii_only from the regexp parser's tracked state. We cannot + // use node->unescaped for this because regexp unescaped content preserves + // escape text (e.g., \x80 is 4 ASCII chars), not the binary values. + bool ascii_only = !regexp_parser.has_hex_escape && !regexp_parser.has_unicode_escape && regexp_parser.non_ascii_literal_count == 0; + // Use the unescaped content for error messages to match CRuby's format, + // where Ruby escapes like \M-\C-? are resolved to bytes but regexp escapes + // like \u{80} are preserved as text. + const char *error_source = (const char *) pm_string_source(&node->unescaped); + int error_source_length = (int) pm_string_length(&node->unescaped); + pm_node_flags_t encoding_flags = pm_regexp_validate_encoding(®exp_parser, ascii_only, flags, error_source, error_source_length); + pm_buffer_cleanup(®exp_parser.hex_escape_buffer); + + // Second pass: walk unescaped content for named capture extraction. + if (name_callback != NULL) { + bool shared = node->unescaped.type == PM_STRING_SHARED; + pm_regexp_parse_named_captures(parser, pm_string_source(&node->unescaped), pm_string_length(&node->unescaped), shared, extended_mode, name_callback, name_data); + } + + return encoding_flags; +} + +/** + * Parse an interpolated regular expression for named capture groups only. + * This is used for the =~ operator with interpolated regexps where we don't + * have a pm_regular_expression_node_t. No encoding validation is performed. + * + * Note: The encoding-tracking fields (has_unicode_escape, has_hex_escape, etc.) + * are initialized but not used for the result. They exist because the parsing + * functions (pm_regexp_parse_backslash_escape, etc.) unconditionally update + * them as they walk through the content. */ -PRISM_EXPORTED_FUNCTION void -pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data) { - pm_regexp_parse_pattern(&(pm_regexp_parser_t) { +void +pm_regexp_parse_named_captures(pm_parser_t *parser, const uint8_t *source, size_t size, bool shared, bool extended_mode, pm_regexp_name_callback_t name_callback, pm_regexp_name_data_t *name_data) { + pm_regexp_parser_t regexp_parser = { .parser = parser, .start = source, .cursor = source, @@ -784,7 +1692,26 @@ pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool ex .encoding = parser->encoding, .name_callback = name_callback, .name_data = name_data, - .error_callback = error_callback, - .error_data = error_data - }); + .shared = shared, + .node_start = source, + .node_end = source + size, + .has_unicode_escape = false, + .has_hex_escape = false, + .last_escape_was_unicode = false, + .explicit_encoding = NULL, + .has_property_escape = false, + .has_unicode_property_escape = false, + .property_name = NULL, + .property_name_length = 0, + .unicode_property_name = NULL, + .unicode_property_name_length = 0, + .non_ascii_literal_count = 0, + .invalid_unicode_range = false, + .hex_escape_buffer = { 0 }, + .hex_group_active = false, + .has_invalid_multibyte = false, + }; + + pm_regexp_parse_pattern(®exp_parser); + pm_buffer_cleanup(®exp_parser.hex_escape_buffer); } diff --git a/prism/regexp.h b/prism/regexp.h deleted file mode 100644 index c0b3163e93..0000000000 --- a/prism/regexp.h +++ /dev/null @@ -1,43 +0,0 @@ -/** - * @file regexp.h - * - * A regular expression parser. - */ -#ifndef PRISM_REGEXP_H -#define PRISM_REGEXP_H - -#include "prism/defines.h" -#include "prism/parser.h" -#include "prism/encoding.h" -#include "prism/util/pm_memchr.h" -#include "prism/util/pm_string.h" - -#include <stdbool.h> -#include <stddef.h> -#include <string.h> - -/** - * This callback is called when a named capture group is found. - */ -typedef void (*pm_regexp_name_callback_t)(const pm_string_t *name, void *data); - -/** - * This callback is called when a parse error is found. - */ -typedef void (*pm_regexp_error_callback_t)(const uint8_t *start, const uint8_t *end, const char *message, void *data); - -/** - * Parse a regular expression. - * - * @param parser The parser that is currently being used. - * @param source The source code to parse. - * @param size The size of the source code. - * @param extended_mode Whether to parse the regular expression in extended mode. - * @param name_callback The optional callback to call when a named capture group is found. - * @param name_data The optional data to pass to the name callback. - * @param error_callback The callback to call when a parse error is found. - * @param error_data The data to pass to the error callback. - */ -PRISM_EXPORTED_FUNCTION void pm_regexp_parse(pm_parser_t *parser, const uint8_t *source, size_t size, bool extended_mode, pm_regexp_name_callback_t name_callback, void *name_data, pm_regexp_error_callback_t error_callback, void *error_data); - -#endif diff --git a/prism/serialize.h b/prism/serialize.h new file mode 100644 index 0000000000..786a1514bc --- /dev/null +++ b/prism/serialize.h @@ -0,0 +1,96 @@ +/** + * @file serialize.h + * + * The functions related to serializing the AST to a binary format. + */ +#ifndef PRISM_SERIALIZE_H +#define PRISM_SERIALIZE_H + +#include "prism/excludes.h" + +/* We optionally support serializing to a binary string. For systems that do not + * want or need this functionality, it can be turned off with the + * PRISM_EXCLUDE_SERIALIZATION define. */ +#ifndef PRISM_EXCLUDE_SERIALIZATION + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include "prism/buffer.h" +#include "prism/parser.h" +#include "prism/source.h" +#include "prism/stream.h" + +/** + * Serialize the AST represented by the given node to the given buffer. + * + * @param parser The parser to serialize. + * @param node The node to serialize. + * @param buffer The buffer to serialize to. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) PRISM_NONNULL(1, 2, 3); + +/** + * Parse the given source to the AST and dump the AST to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) PRISM_NONNULL(1, 2); + +/** + * Parse and serialize the AST represented by the given source into the given + * buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_stream(pm_buffer_t *buffer, pm_source_t *source, const char *data) PRISM_NONNULL(1, 2); + +/** + * Parse and serialize the comments in the given source to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_comments(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) PRISM_NONNULL(1, 2); + +/** + * Lex the given source and serialize to the given buffer. + * + * @param source The source to lex. + * @param size The size of the source. + * @param buffer The buffer to serialize to. + * @param data The optional data to pass to the lexer. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) PRISM_NONNULL(1, 2); + +/** + * Parse and serialize both the AST and the tokens represented by the given + * source to the given buffer. + * + * @param buffer The buffer to serialize to. + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + */ +PRISM_EXPORTED_FUNCTION void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) PRISM_NONNULL(1, 2); + +/** + * Parse the source and return true if it parses without errors or warnings. + * + * @param source The source to parse. + * @param size The size of the source. + * @param data The optional data to pass to the parser. + * @returns True if the source parses without errors or warnings. + */ +PRISM_EXPORTED_FUNCTION bool pm_serialize_parse_success_p(const uint8_t *source, size_t size, const char *data) PRISM_NONNULL(1); + +#endif + +#endif diff --git a/prism/source.c b/prism/source.c new file mode 100644 index 0000000000..f61cb19c1b --- /dev/null +++ b/prism/source.c @@ -0,0 +1,491 @@ +#include "prism/internal/source.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/buffer.h" + +#include <stdlib.h> +#include <string.h> + +/* The following headers are necessary to read files using demand paging. */ +#ifdef _WIN32 +#include <windows.h> +#elif defined(_POSIX_MAPPED_FILES) +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#elif defined(PRISM_HAS_FILESYSTEM) +#include <fcntl.h> +#include <sys/stat.h> +#endif + +static const uint8_t empty_source[] = ""; + +/** + * Allocate and initialize a pm_source_t with the given fields. + */ +static pm_source_t * +pm_source_alloc(const uint8_t *source, size_t length, pm_source_type_t type) { + pm_source_t *result = xmalloc(sizeof(pm_source_t)); + if (result == NULL) abort(); + + *result = (struct pm_source_t) { + .source = source, + .length = length, + .type = type + }; + + return result; +} + +/** + * Create a new source that wraps existing constant memory. + */ +pm_source_t * +pm_source_constant_new(const uint8_t *data, size_t length) { + return pm_source_alloc(data, length, PM_SOURCE_CONSTANT); +} + +/** + * Create a new source that wraps existing shared memory. + */ +pm_source_t * +pm_source_shared_new(const uint8_t *data, size_t length) { + return pm_source_alloc(data, length, PM_SOURCE_SHARED); +} + +/** + * Create a new source that owns its memory. + */ +pm_source_t * +pm_source_owned_new(uint8_t *data, size_t length) { + return pm_source_alloc(data, length, PM_SOURCE_OWNED); +} + +#ifdef _WIN32 +/** + * Represents a file handle on Windows, where the path will need to be freed + * when the file is closed. + */ +typedef struct { + /** The path to the file, which will become allocated memory. */ + WCHAR *path; + + /** The size of the allocated path in bytes. */ + size_t path_size; + + /** The handle to the file, which will start as uninitialized memory. */ + HANDLE file; +} pm_source_file_handle_t; + +/** + * Open the file indicated by the filepath parameter for reading on Windows. + */ +static pm_source_init_result_t +pm_source_file_handle_open(pm_source_file_handle_t *handle, const char *filepath) { + int length = MultiByteToWideChar(CP_UTF8, 0, filepath, -1, NULL, 0); + if (length == 0) return PM_SOURCE_INIT_ERROR_GENERIC; + + handle->path_size = sizeof(WCHAR) * ((size_t) length); + handle->path = xmalloc(handle->path_size); + if ((handle->path == NULL) || (MultiByteToWideChar(CP_UTF8, 0, filepath, -1, handle->path, length) == 0)) { + xfree_sized(handle->path, handle->path_size); + return PM_SOURCE_INIT_ERROR_GENERIC; + } + + handle->file = CreateFileW(handle->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); + if (handle->file == INVALID_HANDLE_VALUE) { + pm_source_init_result_t result = PM_SOURCE_INIT_ERROR_GENERIC; + + if (GetLastError() == ERROR_ACCESS_DENIED) { + DWORD attributes = GetFileAttributesW(handle->path); + if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = PM_SOURCE_INIT_ERROR_DIRECTORY; + } + } + + xfree_sized(handle->path, handle->path_size); + return result; + } + + return PM_SOURCE_INIT_SUCCESS; +} + +/** + * Close the file handle and free the path. + */ +static void +pm_source_file_handle_close(pm_source_file_handle_t *handle) { + xfree_sized(handle->path, handle->path_size); + CloseHandle(handle->file); +} +#endif + +/** + * Create a new source by memory-mapping a file. + */ +pm_source_t * +pm_source_mapped_new(const char *filepath, int open_flags, pm_source_init_result_t *result) { +#ifdef _WIN32 + (void) open_flags; + + /* Open the file for reading. */ + pm_source_file_handle_t handle; + *result = pm_source_file_handle_open(&handle, filepath); + if (*result != PM_SOURCE_INIT_SUCCESS) return NULL; + + /* Get the file size. */ + DWORD file_size = GetFileSize(handle.file, NULL); + if (file_size == INVALID_FILE_SIZE) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* If the file is empty, then return a constant source. */ + if (file_size == 0) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(empty_source, 0, PM_SOURCE_CONSTANT); + } + + /* Create a mapping of the file. */ + HANDLE mapping = CreateFileMapping(handle.file, NULL, PAGE_READONLY, 0, 0, NULL); + if (mapping == NULL) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Map the file into memory. */ + uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); + CloseHandle(mapping); + pm_source_file_handle_close(&handle); + + if (source == NULL) { + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(source, (size_t) file_size, PM_SOURCE_MAPPED); +#elif defined(_POSIX_MAPPED_FILES) + /* Open the file for reading. */ + int fd = open(filepath, O_RDONLY | open_flags); + if (fd == -1) { + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Stat the file to get the file size. */ + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Ensure it is a file and not a directory. */ + if (S_ISDIR(sb.st_mode)) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_DIRECTORY; + return NULL; + } + + /* + * For non-regular files (pipes, character devices), return a specific + * error so the caller can handle reading through their own I/O layer. + */ + if (!S_ISREG(sb.st_mode)) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_NON_REGULAR; + return NULL; + } + + /* mmap the file descriptor to virtually get the contents. */ + size_t size = (size_t) sb.st_size; + + if (size == 0) { + close(fd); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(empty_source, 0, PM_SOURCE_CONSTANT); + } + + uint8_t *source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (source == MAP_FAILED) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + close(fd); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(source, size, PM_SOURCE_MAPPED); +#else + (void) open_flags; + return pm_source_file_new(filepath, result); +#endif +} + +/** + * Create a new source by reading a file into a heap-allocated buffer. + */ +pm_source_t * +pm_source_file_new(const char *filepath, pm_source_init_result_t *result) { +#ifdef _WIN32 + /* Open the file for reading. */ + pm_source_file_handle_t handle; + *result = pm_source_file_handle_open(&handle, filepath); + if (*result != PM_SOURCE_INIT_SUCCESS) return NULL; + + /* Get the file size. */ + const DWORD file_size = GetFileSize(handle.file, NULL); + if (file_size == INVALID_FILE_SIZE) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* If the file is empty, return a constant source. */ + if (file_size == 0) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(empty_source, 0, PM_SOURCE_CONSTANT); + } + + /* Create a buffer to read the file into. */ + uint8_t *source = xmalloc(file_size); + if (source == NULL) { + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Read the contents of the file. */ + DWORD bytes_read; + if (!ReadFile(handle.file, source, file_size, &bytes_read, NULL)) { + xfree_sized(source, file_size); + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Check the number of bytes read. */ + if (bytes_read != file_size) { + xfree_sized(source, file_size); + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + pm_source_file_handle_close(&handle); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(source, (size_t) file_size, PM_SOURCE_OWNED); +#elif defined(PRISM_HAS_FILESYSTEM) + /* Open the file for reading. */ + int fd = open(filepath, O_RDONLY); + if (fd == -1) { + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Stat the file to get the file size. */ + struct stat sb; + if (fstat(fd, &sb) == -1) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + /* Ensure it is a file and not a directory. */ + if (S_ISDIR(sb.st_mode)) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_DIRECTORY; + return NULL; + } + + /* Check the size to see if it's empty. */ + size_t size = (size_t) sb.st_size; + if (size == 0) { + close(fd); + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(empty_source, 0, PM_SOURCE_CONSTANT); + } + + const size_t length = (size_t) size; + uint8_t *source = xmalloc(length); + if (source == NULL) { + close(fd); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + ssize_t bytes_read = read(fd, source, length); + close(fd); + + if (bytes_read == -1 || (size_t) bytes_read != length) { + xfree_sized(source, length); + *result = PM_SOURCE_INIT_ERROR_GENERIC; + return NULL; + } + + *result = PM_SOURCE_INIT_SUCCESS; + return pm_source_alloc(source, length, PM_SOURCE_OWNED); +#else + (void) filepath; + *result = PM_SOURCE_INIT_ERROR_GENERIC; + perror("pm_source_file_new is not implemented for this platform"); + return NULL; +#endif +} + +/** + * Create a new source by reading from a stream. This allocates the source + * but does not read from the stream yet. Use pm_source_stream_read to read + * data. + */ +pm_source_t * +pm_source_stream_new(void *stream, pm_source_stream_fgets_t *fgets, pm_source_stream_feof_t *feof) { + pm_source_t *source = pm_source_alloc(NULL, 0, PM_SOURCE_STREAM); + source->stream.buffer = pm_buffer_new(); + source->stream.stream = stream; + source->stream.fgets = fgets; + source->stream.feof = feof; + source->stream.eof = false; + + return source; +} + +/** + * Read from the stream into the source's internal buffer until __END__ is + * encountered or EOF is reached. Updates the source pointer and length. + * + * Returns true if EOF was reached, false if __END__ was encountered. + */ +bool +pm_source_stream_read(pm_source_t *source) { + pm_buffer_t *buffer = source->stream.buffer; + +#define LINE_SIZE 4096 + char line[LINE_SIZE]; + + while (memset(line, '\n', LINE_SIZE), source->stream.fgets(line, LINE_SIZE, source->stream.stream) != NULL) { + size_t length = LINE_SIZE; + while (length > 0 && line[length - 1] == '\n') length--; + + if (length == LINE_SIZE) { + /* + * If we read a line that is the maximum size and it doesn't end + * with a newline, then we'll just append it to the buffer and + * continue reading. + */ + length--; + pm_buffer_append_string(buffer, line, length); + continue; + } + + /* Append the line to the buffer. */ + length--; + pm_buffer_append_string(buffer, line, length); + + /* + * Check if the line matches the __END__ marker. If it does, then stop + * reading and return false. In most circumstances, this means we should + * stop reading from the stream so that the DATA constant can pick it + * up. + */ + switch (length) { + case 7: + if (strncmp(line, "__END__", 7) == 0) { + source->source = (const uint8_t *) pm_buffer_value(buffer); + source->length = pm_buffer_length(buffer); + return false; + } + break; + case 8: + if (strncmp(line, "__END__\n", 8) == 0) { + source->source = (const uint8_t *) pm_buffer_value(buffer); + source->length = pm_buffer_length(buffer); + return false; + } + break; + case 9: + if (strncmp(line, "__END__\r\n", 9) == 0) { + source->source = (const uint8_t *) pm_buffer_value(buffer); + source->length = pm_buffer_length(buffer); + return false; + } + break; + } + + /* + * All data should be read via gets. If the string returned by gets + * _doesn't_ end with a newline, then we assume we hit EOF condition. + */ + if (source->stream.feof(source->stream.stream)) { + break; + } + } + +#undef LINE_SIZE + + source->stream.eof = true; + source->source = (const uint8_t *) pm_buffer_value(buffer); + source->length = pm_buffer_length(buffer); + return true; +} + +/** + * Returns whether the stream source has reached EOF. + */ +bool +pm_source_stream_eof(const pm_source_t *source) { + return source->stream.eof; +} + +/** + * Free the given source and any memory it owns. + */ +void +pm_source_free(pm_source_t *source) { + switch (source->type) { + case PM_SOURCE_CONSTANT: + case PM_SOURCE_SHARED: + /* No cleanup needed for the data. */ + break; + case PM_SOURCE_OWNED: + xfree_sized((void *) source->source, source->length); + break; + case PM_SOURCE_MAPPED: +#if defined(_WIN32) + if (source->length > 0) { + UnmapViewOfFile((void *) source->source); + } +#elif defined(_POSIX_MAPPED_FILES) + if (source->length > 0) { + munmap((void *) source->source, source->length); + } +#endif + break; + case PM_SOURCE_STREAM: + pm_buffer_free(source->stream.buffer); + break; + } + + xfree_sized(source, sizeof(pm_source_t)); +} + +/** + * Returns the length of the source data in bytes. + */ +size_t +pm_source_length(const pm_source_t *source) { + return source->length; +} + +/** + * Returns a pointer to the source data. + */ +const uint8_t * +pm_source_source(const pm_source_t *source) { + return source->source; +} diff --git a/prism/source.h b/prism/source.h new file mode 100644 index 0000000000..c79987d3fb --- /dev/null +++ b/prism/source.h @@ -0,0 +1,148 @@ +/** + * @file source.h + * + * An opaque type representing the source code being parsed, regardless of + * origin (constant memory, file, memory-mapped file, or stream). + */ +#ifndef PRISM_SOURCE_H +#define PRISM_SOURCE_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/filesystem.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> +#include <stdint.h> + +/** + * An opaque type representing source code being parsed. + */ +typedef struct pm_source_t pm_source_t; + +/** + * This function is used to retrieve a line of input from a stream. It closely + * mirrors that of fgets so that fgets can be used as the default + * implementation. + */ +typedef char * (pm_source_stream_fgets_t)(char *string, int size, void *stream); + +/** + * This function is used to check whether a stream is at EOF. It closely mirrors + * that of feof so that feof can be used as the default implementation. + */ +typedef int (pm_source_stream_feof_t)(void *stream); + +/** + * Represents the result of initializing a source from a file. + */ +typedef enum { + /** Indicates that the source was successfully initialized. */ + PM_SOURCE_INIT_SUCCESS = 0, + + /** + * Indicates a generic error from a source init function, where the type + * of error should be read from `errno` or `GetLastError()`. + */ + PM_SOURCE_INIT_ERROR_GENERIC = 1, + + /** + * Indicates that the file that was attempted to be opened was a directory. + */ + PM_SOURCE_INIT_ERROR_DIRECTORY = 2, + + /** + * Indicates that the file is not a regular file (e.g. a pipe or character + * device) and the caller should handle reading it. + */ + PM_SOURCE_INIT_ERROR_NON_REGULAR = 3 +} pm_source_init_result_t; + +/** + * Create a new source that wraps existing constant memory. The memory is not + * owned and will not be freed. + * + * @param data The pointer to the source data. + * @param length The length of the source data in bytes. + * @returns A new source. Aborts on allocation failure. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_constant_new(const uint8_t *data, size_t length); + +/** + * Create a new source that wraps existing shared memory. The memory is not + * owned and will not be freed. Semantically a "slice" of another source. + * + * @param data The pointer to the source data. + * @param length The length of the source data in bytes. + * @returns A new source. Aborts on allocation failure. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_shared_new(const uint8_t *data, size_t length); + +/** + * Create a new source that owns its memory. The memory will be freed with + * xfree when the source is freed. + * + * @param data The pointer to the heap-allocated source data. + * @param length The length of the source data in bytes. + * @returns A new source. Aborts on allocation failure. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_owned_new(uint8_t *data, size_t length); + +/** + * Create a new source by reading a file into a heap-allocated buffer. + * + * @param filepath The path to the file to read. + * @param result Out parameter for the result of the initialization. + * @returns A new source, or NULL on error (with result written to out param). + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_file_new(const char *filepath, pm_source_init_result_t *result) PRISM_NONNULL(1, 2); + +/** + * Create a new source by memory-mapping a file. Falls back to file reading on + * platforms without mmap support. + * + * If the file is a non-regular file (e.g. a pipe or character device), + * PM_SOURCE_INIT_ERROR_NON_REGULAR is returned, allowing the caller to handle + * it appropriately (e.g. by reading it through their own I/O layer). + * + * @param filepath The path to the file to read. + * @param open_flags Additional flags to pass to open(2) (e.g. O_NONBLOCK). + * @param result Out parameter for the result of the initialization. + * @returns A new source, or NULL on error (with result written to out param). + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_mapped_new(const char *filepath, int open_flags, pm_source_init_result_t *result) PRISM_NONNULL(1, 3); + +/** + * Create a new source by reading from a stream using the provided callbacks. + * + * @param stream The stream to read from. + * @param fgets The function to use to read from the stream. + * @param feof The function to use to check if the stream is at EOF. + * @returns A new source. Aborts on allocation failure. + */ +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_source_t * pm_source_stream_new(void *stream, pm_source_stream_fgets_t *fgets, pm_source_stream_feof_t *feof); + +/** + * Free the given source and any memory it owns. + * + * @param source The source to free. + */ +PRISM_EXPORTED_FUNCTION void pm_source_free(pm_source_t *source) PRISM_NONNULL(1); + +/** + * Returns the length of the source data in bytes. + * + * @param source The source to get the length of. + * @returns The length of the source data. + */ +PRISM_EXPORTED_FUNCTION size_t pm_source_length(const pm_source_t *source) PRISM_NONNULL(1); + +/** + * Returns a pointer to the source data. + * + * @param source The source to get the data of. + * @returns A pointer to the source data. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_source_source(const pm_source_t *source) PRISM_NONNULL(1); + +#endif diff --git a/prism/srcs.mk b/prism/srcs.mk new file mode 100644 index 0000000000..93ad8f579f --- /dev/null +++ b/prism/srcs.mk @@ -0,0 +1,160 @@ +PRISM_TEMPLATES_DIR = $(PRISM_SRCDIR)/templates +PRISM_TEMPLATE = $(PRISM_TEMPLATES_DIR)/template.rb +PRISM_CONFIG = $(PRISM_SRCDIR)/config.yml + +srcs uncommon.mk: prism/.srcs.mk.time + +prism/.srcs.mk.time: $(order_only) $(PRISM_BUILD_DIR)/.time +prism/$(HAVE_BASERUBY:no=.srcs.mk.time): + touch $@ +prism/$(HAVE_BASERUBY:yes=.srcs.mk.time): \ + $(PRISM_SRCDIR)/templates/template.rb \ + $(PRISM_SRCDIR)/srcs.mk.in + $(BASERUBY) $(tooldir)/generic_erb.rb -c -t$@ -o $(PRISM_SRCDIR)/srcs.mk $(PRISM_SRCDIR)/srcs.mk.in + +distclean-prism-srcs:: + $(RM) prism/.srcs.mk.time + $(RMDIRS) prism || $(NULLCMD) + +distclean-srcs-local:: distclean-prism-srcs + +realclean-prism-srcs:: distclean-prism-srcs + $(RM) $(PRISM_SRCDIR)/srcs.mk + +realclean-srcs-local:: realclean-prism-srcs + +main srcs: prism-srcs +main incs: prism-incs + +prism-srcs: $(srcdir)/prism/api_node.c +$(srcdir)/prism/api_node.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/ext/prism/api_node.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) ext/prism/api_node.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/api_node.c + +prism-incs: $(srcdir)/prism/ast.h +$(srcdir)/prism/ast.h: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/include/prism/ast.h.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) include/prism/ast.h $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/ast.h + +prism-incs: $(srcdir)/prism/internal/diagnostic.h +$(srcdir)/prism/internal/diagnostic.h: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/include/prism/internal/diagnostic.h.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) include/prism/internal/diagnostic.h $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/internal/diagnostic.h + +prism-srcs: $(srcdir)/lib/prism/compiler.rb +$(srcdir)/lib/prism/compiler.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/compiler.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/compiler.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/compiler.rb + +prism-srcs: $(srcdir)/lib/prism/dispatcher.rb +$(srcdir)/lib/prism/dispatcher.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dispatcher.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dispatcher.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dispatcher.rb + +prism-srcs: $(srcdir)/lib/prism/dot_visitor.rb +$(srcdir)/lib/prism/dot_visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dot_visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dot_visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dot_visitor.rb + +prism-srcs: $(srcdir)/lib/prism/dsl.rb +$(srcdir)/lib/prism/dsl.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/dsl.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/dsl.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/dsl.rb + +prism-srcs: $(srcdir)/lib/prism/inspect_visitor.rb +$(srcdir)/lib/prism/inspect_visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/inspect_visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/inspect_visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/inspect_visitor.rb + +prism-srcs: $(srcdir)/lib/prism/mutation_compiler.rb +$(srcdir)/lib/prism/mutation_compiler.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/mutation_compiler.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/mutation_compiler.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/mutation_compiler.rb + +prism-srcs: $(srcdir)/lib/prism/node.rb +$(srcdir)/lib/prism/node.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/node.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/node.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/node.rb + +prism-srcs: $(srcdir)/lib/prism/reflection.rb +$(srcdir)/lib/prism/reflection.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/reflection.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/reflection.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/reflection.rb + +prism-srcs: $(srcdir)/lib/prism/serialize.rb +$(srcdir)/lib/prism/serialize.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/serialize.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/serialize.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/serialize.rb + +prism-srcs: $(srcdir)/lib/prism/visitor.rb +$(srcdir)/lib/prism/visitor.rb: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/lib/prism/visitor.rb.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) lib/prism/visitor.rb $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/lib/prism/visitor.rb + +prism-srcs: $(srcdir)/prism/diagnostic.c +$(srcdir)/prism/diagnostic.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/diagnostic.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/diagnostic.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/diagnostic.c + +prism-srcs: $(srcdir)/prism/json.c +$(srcdir)/prism/json.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/json.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/json.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/json.c + +prism-srcs: $(srcdir)/prism/node.c +$(srcdir)/prism/node.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/node.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/node.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/node.c + +prism-srcs: $(srcdir)/prism/prettyprint.c +$(srcdir)/prism/prettyprint.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/prettyprint.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/prettyprint.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/prettyprint.c + +prism-srcs: $(srcdir)/prism/serialize.c +$(srcdir)/prism/serialize.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/serialize.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/serialize.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/serialize.c + +prism-srcs: $(srcdir)/prism/tokens.c +$(srcdir)/prism/tokens.c: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/src/tokens.c.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) src/tokens.c $@ + +realclean-prism-srcs:: + $(RM) $(srcdir)/prism/tokens.c diff --git a/prism/srcs.mk.in b/prism/srcs.mk.in new file mode 100644 index 0000000000..6149e4ae9d --- /dev/null +++ b/prism/srcs.mk.in @@ -0,0 +1,52 @@ +<% # -*- ruby -*- +# :stopdoc: +require_relative 'templates/template' + +script = File.basename(__FILE__) +srcs = output ? File.basename(output) : script.chomp('.in') +mk = 'uncommon.mk' + +# %> +PRISM_TEMPLATES_DIR = $(PRISM_SRCDIR)/templates +PRISM_TEMPLATE = $(PRISM_TEMPLATES_DIR)/template.rb +PRISM_CONFIG = $(PRISM_SRCDIR)/config.yml + +srcs <%=%><%=mk%>: prism/.srcs.mk.time + +prism/.srcs.mk.time: $(order_only) $(PRISM_BUILD_DIR)/.time +prism/$(HAVE_BASERUBY:no=.srcs.mk.time): + touch $@ +prism/$(HAVE_BASERUBY:yes=.srcs.mk.time): \ + $(PRISM_SRCDIR)/templates/template.rb \ + $(PRISM_SRCDIR)/<%=%><%=script%> + $(BASERUBY) $(tooldir)/generic_erb.rb -c -t$@ -o $(PRISM_SRCDIR)/<%=%><%=srcs%> $(PRISM_SRCDIR)/<%=%><%=script%> + +distclean-prism-srcs:: + $(RM) prism/.srcs.mk.time + $(RMDIRS) prism || $(NULLCMD) + +distclean-srcs-local:: distclean-prism-srcs + +realclean-prism-srcs:: distclean-prism-srcs + $(RM) $(PRISM_SRCDIR)/<%=%><%=srcs%> + +realclean-srcs-local:: realclean-prism-srcs + +main srcs: prism-srcs +main incs: prism-incs +<% Prism::Template::TEMPLATES.map do |t| + /\.(?:[ch]|rb)\z/ =~ t or next + s = '$(srcdir)/' + t.sub(%r[\A(?:(src)|ext|include)/]) {$1 && 'prism/'} + s.sub!(%r[\A\$(srcdir)/prism/], '$(PRISM_SRCDIR)/') + target = s.end_with?('.h') ? 'incs' : 'srcs' +# %> + +prism-<%=%><%=target%>: <%=%><%=s%> +<%=%><%=s%>: $(PRISM_CONFIG) $(PRISM_TEMPLATE) $(PRISM_TEMPLATES_DIR)/<%=%><%=t%>.erb + $(Q) $(BASERUBY) $(PRISM_TEMPLATE) <%=%><%=t%> $@ + +realclean-prism-srcs:: + $(RM) <%=%><%=s%> +<% +end +# %> diff --git a/prism/static_literals.c b/prism/static_literals.c index 9fa37b999a..9af1eadf5d 100644 --- a/prism/static_literals.c +++ b/prism/static_literals.c @@ -1,4 +1,18 @@ -#include "prism/static_literals.h" +#include "prism/internal/static_literals.h" + +#include "prism/compiler/inline.h" +#include "prism/compiler/unused.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/buffer.h" +#include "prism/internal/integer.h" +#include "prism/internal/isinf.h" +#include "prism/internal/stringy.h" + +#include <assert.h> +#include <math.h> +#include <stdlib.h> +#include <string.h> /** * A small struct used for passing around a subset of the information that is @@ -7,7 +21,10 @@ */ typedef struct { /** The list of newline offsets to use to calculate line numbers. */ - const pm_newline_list_t *newline_list; + const pm_line_offset_list_t *line_offsets; + + /** The start of the source being parsed. */ + const uint8_t *start; /** The line number that the parser starts on. */ int32_t start_line; @@ -16,7 +33,7 @@ typedef struct { const char *encoding_name; } pm_static_literals_metadata_t; -static inline uint32_t +static PRISM_INLINE uint32_t murmur_scramble(uint32_t value) { value *= 0xcc9e2d51; value = (value << 15) | (value >> 17); @@ -92,7 +109,7 @@ node_hash(const pm_static_literals_metadata_t *metadata, const pm_node_t *node) } case PM_SOURCE_LINE_NODE: { // Source lines hash their line number. - const pm_line_column_t line_column = pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line); + const pm_line_column_t line_column = pm_line_offset_list_line_column(metadata->line_offsets, node->location.start, metadata->start_line); const int32_t *value = &line_column.line; return murmur_hash((const uint8_t *) value, sizeof(int32_t)); } @@ -180,7 +197,7 @@ pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *m } // Finally, free the old node list and update the hash. - xfree(hash->nodes); + xfree_sized(hash->nodes, hash->capacity * sizeof(pm_node_t *)); hash->nodes = new_nodes; hash->capacity = new_capacity; } @@ -218,7 +235,7 @@ pm_node_hash_insert(pm_node_hash_t *hash, const pm_static_literals_metadata_t *m */ static void pm_node_hash_free(pm_node_hash_t *hash) { - if (hash->capacity > 0) xfree(hash->nodes); + if (hash->capacity > 0) xfree_sized(hash->nodes, hash->capacity * sizeof(pm_node_t *)); } /** @@ -240,7 +257,7 @@ pm_int64_value(const pm_static_literals_metadata_t *metadata, const pm_node_t *n return integer->negative ? -value : value; } case PM_SOURCE_LINE_NODE: - return (int64_t) pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line).line; + return (int64_t) pm_line_offset_list_line_column(metadata->line_offsets, node->location.start, metadata->start_line).line; default: assert(false && "unreachable"); return 0; @@ -268,7 +285,7 @@ pm_compare_integer_nodes(const pm_static_literals_metadata_t *metadata, const pm * A comparison function for comparing two FloatNode instances. */ static int -pm_compare_float_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { +pm_compare_float_nodes(PRISM_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { const double left_value = ((const pm_float_node_t *) left)->value; const double right_value = ((const pm_float_node_t *) right)->value; return PM_NUMERIC_COMPARISON(left_value, right_value); @@ -327,7 +344,7 @@ pm_string_value(const pm_node_t *node) { * A comparison function for comparing two nodes that have attached strings. */ static int -pm_compare_string_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { +pm_compare_string_nodes(PRISM_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { const pm_string_t *left_string = pm_string_value(left); const pm_string_t *right_string = pm_string_value(right); return pm_string_compare(left_string, right_string); @@ -337,7 +354,7 @@ pm_compare_string_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata * A comparison function for comparing two RegularExpressionNode instances. */ static int -pm_compare_regular_expression_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { +pm_compare_regular_expression_nodes(PRISM_UNUSED const pm_static_literals_metadata_t *metadata, const pm_node_t *left, const pm_node_t *right) { const pm_regular_expression_node_t *left_regexp = (const pm_regular_expression_node_t *) left; const pm_regular_expression_node_t *right_regexp = (const pm_regular_expression_node_t *) right; @@ -353,14 +370,15 @@ pm_compare_regular_expression_nodes(PRISM_ATTRIBUTE_UNUSED const pm_static_liter * Add a node to the set of static literals. */ pm_node_t * -pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace) { +pm_static_literals_add(const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, pm_static_literals_t *literals, pm_node_t *node, bool replace) { switch (PM_NODE_TYPE(node)) { case PM_INTEGER_NODE: case PM_SOURCE_LINE_NODE: return pm_node_hash_insert( &literals->integer_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -372,7 +390,8 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line return pm_node_hash_insert( &literals->float_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -385,7 +404,8 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line return pm_node_hash_insert( &literals->number_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -398,7 +418,8 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line return pm_node_hash_insert( &literals->string_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -410,7 +431,8 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line return pm_node_hash_insert( &literals->regexp_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -422,7 +444,8 @@ pm_static_literals_add(const pm_newline_list_t *newline_list, int32_t start_line return pm_node_hash_insert( &literals->symbol_nodes, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = NULL }, @@ -492,7 +515,7 @@ pm_static_literal_positive_p(const pm_node_t *node) { /** * Create a string-based representation of the given static literal. */ -static inline void +static PRISM_INLINE void pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_metadata_t *metadata, const pm_node_t *node) { switch (PM_NODE_TYPE(node)) { case PM_FALSE_NODE: @@ -502,12 +525,12 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met const double value = ((const pm_float_node_t *) node)->value; if (PRISM_ISINF(value)) { - if (*node->location.start == '-') { + if (metadata->start[node->location.start] == '-') { pm_buffer_append_byte(buffer, '-'); } pm_buffer_append_string(buffer, "Infinity", 8); } else if (value == 0.0) { - if (*node->location.start == '-') { + if (metadata->start[node->location.start] == '-') { pm_buffer_append_byte(buffer, '-'); } pm_buffer_append_string(buffer, "0.0", 3); @@ -576,7 +599,7 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met break; } case PM_SOURCE_LINE_NODE: - pm_buffer_append_format(buffer, "%d", pm_newline_list_line_column(metadata->newline_list, node->location.start, metadata->start_line).line); + pm_buffer_append_format(buffer, "%d", pm_line_offset_list_line_column(metadata->line_offsets, node->location.start, metadata->start_line).line); break; case PM_STRING_NODE: { const pm_string_t *unescaped = &((const pm_string_node_t *) node)->unescaped; @@ -604,11 +627,12 @@ pm_static_literal_inspect_node(pm_buffer_t *buffer, const pm_static_literals_met * Create a string-based representation of the given static literal. */ void -pm_static_literal_inspect(pm_buffer_t *buffer, const pm_newline_list_t *newline_list, int32_t start_line, const char *encoding_name, const pm_node_t *node) { +pm_static_literal_inspect(pm_buffer_t *buffer, const pm_line_offset_list_t *line_offsets, const uint8_t *start, int32_t start_line, const char *encoding_name, const pm_node_t *node) { pm_static_literal_inspect_node( buffer, &(pm_static_literals_metadata_t) { - .newline_list = newline_list, + .line_offsets = line_offsets, + .start = start, .start_line = start_line, .encoding_name = encoding_name }, diff --git a/prism/stream.h b/prism/stream.h new file mode 100644 index 0000000000..678322b442 --- /dev/null +++ b/prism/stream.h @@ -0,0 +1,28 @@ +/** + * @file stream.h + * + * Functions for parsing streams. + */ +#ifndef PRISM_STREAM_H +#define PRISM_STREAM_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include "prism/arena.h" +#include "prism/options.h" +#include "prism/parser.h" +#include "prism/source.h" + +/** + * Parse a stream of Ruby source and return the tree. + * + * @param parser The out parameter to write the parser to. + * @param arena The arena to use for all AST-lifetime allocations. + * @param source The source to use, created via pm_source_stream_new. + * @param options The optional options to use when parsing. + * @returns The AST representing the source. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse_stream(pm_parser_t **parser, pm_arena_t *arena, pm_source_t *source, const pm_options_t *options) PRISM_NONNULL(1, 2, 3); + +#endif diff --git a/prism/string_query.c b/prism/string_query.c new file mode 100644 index 0000000000..ccedaf9c00 --- /dev/null +++ b/prism/string_query.c @@ -0,0 +1,166 @@ +#include "prism/string_query.h" + +#include "prism/internal/char.h" +#include "prism/internal/encoding.h" + +#include <assert.h> +#include <string.h> + +/** The category of slice returned from pm_slice_type. */ +typedef enum { + /** Returned when the given encoding name is invalid. */ + PM_SLICE_TYPE_ERROR = -1, + + /** Returned when no other types apply to the slice. */ + PM_SLICE_TYPE_NONE, + + /** Returned when the slice is a valid local variable name. */ + PM_SLICE_TYPE_LOCAL, + + /** Returned when the slice is a valid constant name. */ + PM_SLICE_TYPE_CONSTANT, + + /** Returned when the slice is a valid method name. */ + PM_SLICE_TYPE_METHOD_NAME +} pm_slice_type_t; + +/** + * Check that the slice is a valid local variable name or constant. + */ +static pm_slice_type_t +pm_slice_type(const uint8_t *source, size_t length, const char *encoding_name) { + // first, get the right encoding object + const pm_encoding_t *encoding = pm_encoding_find((const uint8_t *) encoding_name, (const uint8_t *) (encoding_name + strlen(encoding_name))); + if (encoding == NULL) return PM_SLICE_TYPE_ERROR; + + // check that there is at least one character + if (length == 0) return PM_SLICE_TYPE_NONE; + + size_t width; + if ((width = encoding->alpha_char(source, (ptrdiff_t) length)) != 0) { + // valid because alphabetical + } else if (*source == '_') { + // valid because underscore + width = 1; + } else if ((*source >= 0x80) && ((width = encoding->char_width(source, (ptrdiff_t) length)) > 0)) { + // valid because multibyte + } else { + // invalid because no match + return PM_SLICE_TYPE_NONE; + } + + // determine the type of the slice based on the first character + const uint8_t *end = source + length; + pm_slice_type_t result = encoding->isupper_char(source, end - source) ? PM_SLICE_TYPE_CONSTANT : PM_SLICE_TYPE_LOCAL; + + // next, iterate through all of the bytes of the string to ensure that they + // are all valid identifier characters + source += width; + + while (source < end) { + if ((width = encoding->alnum_char(source, end - source)) != 0) { + // valid because alphanumeric + source += width; + } else if (*source == '_') { + // valid because underscore + source++; + } else if ((*source >= 0x80) && ((width = encoding->char_width(source, end - source)) > 0)) { + // valid because multibyte + source += width; + } else { + // invalid because no match + break; + } + } + + // accept a ! or ? at the end of the slice as a method name + if (*source == '!' || *source == '?' || *source == '=') { + source++; + result = PM_SLICE_TYPE_METHOD_NAME; + } + + // valid if we are at the end of the slice + return source == end ? result : PM_SLICE_TYPE_NONE; +} + +/** + * Check that the slice is a valid local variable name. + */ +pm_string_query_t +pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name) { + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + case PM_SLICE_TYPE_CONSTANT: + case PM_SLICE_TYPE_METHOD_NAME: + return PM_STRING_QUERY_FALSE; + case PM_SLICE_TYPE_LOCAL: + return PM_STRING_QUERY_TRUE; + } + + assert(false && "unreachable"); + return PM_STRING_QUERY_FALSE; +} + +/** + * Check that the slice is a valid constant name. + */ +pm_string_query_t +pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name) { + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + case PM_SLICE_TYPE_LOCAL: + case PM_SLICE_TYPE_METHOD_NAME: + return PM_STRING_QUERY_FALSE; + case PM_SLICE_TYPE_CONSTANT: + return PM_STRING_QUERY_TRUE; + } + + assert(false && "unreachable"); + return PM_STRING_QUERY_FALSE; +} + +/** + * Check that the slice is a valid method name. + */ +pm_string_query_t +pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name) { +#define B(p) ((p) ? PM_STRING_QUERY_TRUE : PM_STRING_QUERY_FALSE) +#define C1(c) (*source == c) +#define C2(s) (memcmp(source, s, 2) == 0) +#define C3(s) (memcmp(source, s, 3) == 0) + + switch (pm_slice_type(source, length, encoding_name)) { + case PM_SLICE_TYPE_ERROR: + return PM_STRING_QUERY_ERROR; + case PM_SLICE_TYPE_NONE: + break; + case PM_SLICE_TYPE_LOCAL: + // numbered parameters are not valid method names + return B((length != 2) || (source[0] != '_') || (source[1] == '0') || !pm_char_is_decimal_digit(source[1])); + case PM_SLICE_TYPE_CONSTANT: + // all constants are valid method names + case PM_SLICE_TYPE_METHOD_NAME: + // all method names are valid method names + return PM_STRING_QUERY_TRUE; + } + + switch (length) { + case 1: + return B(C1('&') || C1('`') || C1('!') || C1('^') || C1('>') || C1('<') || C1('-') || C1('%') || C1('|') || C1('+') || C1('/') || C1('*') || C1('~')); + case 2: + return B(C2("!=") || C2("!~") || C2("[]") || C2("==") || C2("=~") || C2(">=") || C2(">>") || C2("<=") || C2("<<") || C2("**")); + case 3: + return B(C3("===") || C3("<=>") || C3("[]=")); + default: + return PM_STRING_QUERY_FALSE; + } + +#undef B +#undef C1 +#undef C2 +#undef C3 +} diff --git a/prism/string_query.h b/prism/string_query.h new file mode 100644 index 0000000000..6ee1a9d9b6 --- /dev/null +++ b/prism/string_query.h @@ -0,0 +1,63 @@ +/** + * @file string_query.h + * + * Functions for querying properties of strings, such as whether they are valid + * local variable names, constant names, or method names. + */ +#ifndef PRISM_STRING_QUERY_H +#define PRISM_STRING_QUERY_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> +#include <stdint.h> + +/** + * Represents the results of a slice query. + */ +typedef enum { + /** Returned if the encoding given to a slice query was invalid. */ + PM_STRING_QUERY_ERROR = -1, + + /** Returned if the result of the slice query is false. */ + PM_STRING_QUERY_FALSE, + + /** Returned if the result of the slice query is true. */ + PM_STRING_QUERY_TRUE +} pm_string_query_t; + +/** + * Check that the slice is a valid local variable name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @returns PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_local(const uint8_t *source, size_t length, const char *encoding_name) PRISM_NONNULL(1, 3); + +/** + * Check that the slice is a valid constant name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @returns PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_constant(const uint8_t *source, size_t length, const char *encoding_name) PRISM_NONNULL(1, 3); + +/** + * Check that the slice is a valid method name. + * + * @param source The source to check. + * @param length The length of the source. + * @param encoding_name The name of the encoding of the source. + * @returns PM_STRING_QUERY_TRUE if the query is true, PM_STRING_QUERY_FALSE if + * the query is false, and PM_STRING_QUERY_ERROR if the encoding was invalid. + */ +PRISM_EXPORTED_FUNCTION pm_string_query_t pm_string_query_method_name(const uint8_t *source, size_t length, const char *encoding_name) PRISM_NONNULL(1, 3); + +#endif diff --git a/prism/stringy.c b/prism/stringy.c new file mode 100644 index 0000000000..d6f4c4a777 --- /dev/null +++ b/prism/stringy.c @@ -0,0 +1,91 @@ +#include "prism/internal/stringy.h" + +#include "prism/internal/allocator.h" + +#include <assert.h> +#include <stdlib.h> +#include <string.h> + +/** + * Initialize a shared string that is based on initial input. + */ +void +pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end) { + assert(start <= end); + + *string = (pm_string_t) { + .type = PM_STRING_SHARED, + .source = start, + .length = (size_t) (end - start) + }; +} + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + */ +void +pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_OWNED, + .source = source, + .length = length + }; +} + +/** + * Initialize a constant string that doesn't own its memory source. + */ +void +pm_string_constant_init(pm_string_t *string, const char *source, size_t length) { + *string = (pm_string_t) { + .type = PM_STRING_CONSTANT, + .source = (const uint8_t *) source, + .length = length + }; +} + +/** + * Compare the underlying lengths and bytes of two strings. Returns 0 if the + * strings are equal, a negative number if the left string is less than the + * right string, and a positive number if the left string is greater than the + * right string. + */ +int +pm_string_compare(const pm_string_t *left, const pm_string_t *right) { + size_t left_length = pm_string_length(left); + size_t right_length = pm_string_length(right); + + if (left_length < right_length) { + return -1; + } else if (left_length > right_length) { + return 1; + } + + return memcmp(pm_string_source(left), pm_string_source(right), left_length); +} + +/** + * Returns the length associated with the string. + */ +size_t +pm_string_length(const pm_string_t *string) { + return string->length; +} + +/** + * Returns the start pointer associated with the string. + */ +const uint8_t * +pm_string_source(const pm_string_t *string) { + return string->source; +} + +/** + * Free the associated memory of the given string. + */ +void +pm_string_cleanup(pm_string_t *string) { + if (string->type == PM_STRING_OWNED) { + xfree_sized((void *) string->source, string->length); + } +} diff --git a/prism/stringy.h b/prism/stringy.h new file mode 100644 index 0000000000..0d64387ac3 --- /dev/null +++ b/prism/stringy.h @@ -0,0 +1,72 @@ +/** + * @file stringy.h + * + * A generic string type that can have various ownership semantics. + */ +#ifndef PRISM_STRINGY_H +#define PRISM_STRINGY_H + +#include "prism/compiler/exported.h" +#include "prism/compiler/nonnull.h" + +#include <stddef.h> +#include <stdint.h> + +/** + * A generic string type that can have various ownership semantics. + */ +typedef struct { + /** A pointer to the start of the string. */ + const uint8_t *source; + + /** The length of the string in bytes of memory. */ + size_t length; + + /** The type of the string. This field determines how the string should be freed. */ + enum { + /** This string is a constant string, and should not be freed. */ + PM_STRING_CONSTANT, + + /** This is a slice of another string, and should not be freed. */ + PM_STRING_SHARED, + + /** This string owns its memory, and should be freed internally. */ + PM_STRING_OWNED + } type; +} pm_string_t; + +/** + * Initialize a constant string that doesn't own its memory source. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +PRISM_EXPORTED_FUNCTION void pm_string_constant_init(pm_string_t *string, const char *source, size_t length) PRISM_NONNULL(1); + +/** + * Initialize an owned string that is responsible for freeing allocated memory. + * + * @param string The string to initialize. + * @param source The source of the string. + * @param length The length of the string. + */ +PRISM_EXPORTED_FUNCTION void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) PRISM_NONNULL(1, 2); + +/** + * Returns the length associated with the string. + * + * @param string The string to get the length of. + * @returns The length of the string. + */ +PRISM_EXPORTED_FUNCTION size_t pm_string_length(const pm_string_t *string) PRISM_NONNULL(1); + +/** + * Returns the start pointer associated with the string. + * + * @param string The string to get the start pointer of. + * @returns The start pointer of the string. + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_string_source(const pm_string_t *string) PRISM_NONNULL(1); + +#endif diff --git a/prism/util/pm_strncasecmp.c b/prism/strncasecmp.c index 3f58421554..a373cad6d7 100644 --- a/prism/util/pm_strncasecmp.c +++ b/prism/strncasecmp.c @@ -1,11 +1,12 @@ -#include "prism/util/pm_strncasecmp.h" +#include "prism/internal/strncasecmp.h" + +#include "prism/compiler/inline.h" /** * A locale-insensitive version of `tolower(3)` */ -static inline int -pm_tolower(int c) -{ +static PRISM_INLINE int +pm_tolower(int c) { if ('A' <= c && c <= 'Z') { return c | 0x20; } diff --git a/prism/strpbrk.c b/prism/strpbrk.c new file mode 100644 index 0000000000..383707eb72 --- /dev/null +++ b/prism/strpbrk.c @@ -0,0 +1,439 @@ +#include "prism/internal/strpbrk.h" + +#include "prism/compiler/accel.h" +#include "prism/compiler/inline.h" +#include "prism/compiler/unused.h" + +#include "prism/internal/bit.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/parser.h" + +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +/** + * Add an invalid multibyte character error to the parser. + */ +static PRISM_INLINE void +pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, uint32_t start, uint32_t length) { + pm_diagnostic_list_append_format(&parser->metadata_arena, &parser->error_list, start, length, PM_ERR_INVALID_MULTIBYTE_CHARACTER, parser->start[start]); +} + +/** + * Set the explicit encoding for the parser to the current encoding. + */ +static PRISM_INLINE void +pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, uint32_t start, uint32_t length) { + if (parser->explicit_encoding != NULL) { + if (parser->explicit_encoding == parser->encoding) { + // Okay, we already locked to this encoding. + } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // Not okay, we already found a Unicode escape sequence and this + // conflicts. + pm_diagnostic_list_append_format(&parser->metadata_arena, &parser->error_list, start, length, PM_ERR_MIXED_ENCODING, parser->encoding->name); + } else { + // Should not be anything else. + assert(false && "unreachable"); + } + } + + parser->explicit_encoding = parser->encoding; +} + +/** + * Scan forward through ASCII bytes looking for a byte that is in the given + * charset. Returns true if a match was found, storing its offset in *index. + * Returns false if no match was found, storing the number of ASCII bytes + * consumed in *index (so the caller can skip past them). + * + * All charset characters must be ASCII (< 0x80). The scanner stops at non-ASCII + * bytes, returning control to the caller's encoding-aware loop. + * + * Up to three optimized implementations are selected at compile time, with a + * no-op fallback for unsupported platforms: + * 1. NEON — processes 16 bytes per iteration on aarch64. + * 2. SSSE3 — processes 16 bytes per iteration on x86-64. + * 3. SWAR — little-endian fallback, processes 8 bytes per iteration. + */ + +#if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR) + +/** + * Update the cached strpbrk lookup tables if the charset has changed. The + * parser caches the last charset's precomputed tables so that repeated calls + * with the same breakpoints (the common case during string/regex/list lexing) + * skip table construction entirely. + * + * Builds three structures: + * - low_lut/high_lut: nibble-based lookup tables for SIMD matching (NEON/SSSE3) + * - table: 256-bit bitmap for scalar fallback matching (all platforms) + */ +static PRISM_INLINE void +pm_strpbrk_cache_update(pm_parser_t *parser, const uint8_t *charset) { + // The cache key is the full charset buffer (PM_STRPBRK_CACHE_SIZE bytes). + // Since it is always NUL-padded, a fixed-size comparison covers both + // content and length. + if (memcmp(parser->strpbrk_cache.charset, charset, sizeof(parser->strpbrk_cache.charset)) == 0) return; + + memset(parser->strpbrk_cache.low_lut, 0, sizeof(parser->strpbrk_cache.low_lut)); + memset(parser->strpbrk_cache.high_lut, 0, sizeof(parser->strpbrk_cache.high_lut)); + memset(parser->strpbrk_cache.table, 0, sizeof(parser->strpbrk_cache.table)); + + // Always include NUL in the tables. The slow path uses strchr, which + // always matches NUL (it finds the C string terminator), so NUL is + // effectively always a breakpoint. Replicating that here lets the fast + // scanner handle NUL at full speed instead of bailing to the slow path. + parser->strpbrk_cache.low_lut[0x00] |= (uint8_t) (1 << 0); + parser->strpbrk_cache.high_lut[0x00] = (uint8_t) (1 << 0); + parser->strpbrk_cache.table[0] |= (uint64_t) 1; + + size_t charset_len = 0; + for (const uint8_t *c = charset; *c != '\0'; c++) { + parser->strpbrk_cache.low_lut[*c & 0x0F] |= (uint8_t) (1 << (*c >> 4)); + parser->strpbrk_cache.high_lut[*c >> 4] = (uint8_t) (1 << (*c >> 4)); + parser->strpbrk_cache.table[*c >> 6] |= (uint64_t) 1 << (*c & 0x3F); + charset_len++; + } + + // Store the new charset key, NUL-padded to the full buffer size. + memcpy(parser->strpbrk_cache.charset, charset, charset_len + 1); + memset(parser->strpbrk_cache.charset + charset_len + 1, 0, sizeof(parser->strpbrk_cache.charset) - charset_len - 1); +} + +#endif + +#if defined(PRISM_HAS_NEON) +#include <arm_neon.h> + +static PRISM_INLINE bool +scan_strpbrk_ascii(pm_parser_t *parser, const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) { + pm_strpbrk_cache_update(parser, charset); + + uint8x16_t low_lut = vld1q_u8(parser->strpbrk_cache.low_lut); + uint8x16_t high_lut = vld1q_u8(parser->strpbrk_cache.high_lut); + uint8x16_t mask_0f = vdupq_n_u8(0x0F); + uint8x16_t mask_80 = vdupq_n_u8(0x80); + + size_t idx = 0; + + while (idx + 16 <= maximum) { + uint8x16_t v = vld1q_u8(source + idx); + + // If any byte has the high bit set, we have non-ASCII data. + // Return to let the caller's encoding-aware loop handle it. + if (vmaxvq_u8(vandq_u8(v, mask_80)) != 0) break; + + uint8x16_t lo_class = vqtbl1q_u8(low_lut, vandq_u8(v, mask_0f)); + uint8x16_t hi_class = vqtbl1q_u8(high_lut, vshrq_n_u8(v, 4)); + uint8x16_t matched = vtstq_u8(lo_class, hi_class); + + if (vmaxvq_u8(matched) == 0) { + idx += 16; + continue; + } + + // Find the position of the first matching byte. + uint64_t lo64 = vgetq_lane_u64(vreinterpretq_u64_u8(matched), 0); + if (lo64 != 0) { + *index = idx + pm_ctzll(lo64) / 8; + return true; + } + uint64_t hi64 = vgetq_lane_u64(vreinterpretq_u64_u8(matched), 1); + *index = idx + 8 + pm_ctzll(hi64) / 8; + return true; + } + + // Scalar tail for remaining < 16 ASCII bytes. + while (idx < maximum && source[idx] < 0x80) { + uint8_t byte = source[idx]; + if (parser->strpbrk_cache.table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) { + *index = idx; + return true; + } + idx++; + } + + *index = idx; + return false; +} + +#elif defined(PRISM_HAS_SSSE3) +#include <tmmintrin.h> + +static PRISM_INLINE bool +scan_strpbrk_ascii(pm_parser_t *parser, const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) { + pm_strpbrk_cache_update(parser, charset); + + __m128i low_lut = _mm_loadu_si128((const __m128i *) parser->strpbrk_cache.low_lut); + __m128i high_lut = _mm_loadu_si128((const __m128i *) parser->strpbrk_cache.high_lut); + __m128i mask_0f = _mm_set1_epi8(0x0F); + + size_t idx = 0; + + while (idx + 16 <= maximum) { + __m128i v = _mm_loadu_si128((const __m128i *) (source + idx)); + + // If any byte has the high bit set, stop. + if (_mm_movemask_epi8(v) != 0) break; + + // Nibble-based classification using pshufb (SSSE3), same as NEON + // vqtbl1q_u8. A byte matches iff (low_lut[lo_nib] & high_lut[hi_nib]) != 0. + __m128i lo_class = _mm_shuffle_epi8(low_lut, _mm_and_si128(v, mask_0f)); + __m128i hi_class = _mm_shuffle_epi8(high_lut, _mm_and_si128(_mm_srli_epi16(v, 4), mask_0f)); + __m128i matched = _mm_and_si128(lo_class, hi_class); + + // Check if any byte matched. + int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(matched, _mm_setzero_si128())); + + if (mask == 0xFFFF) { + // All bytes were zero — no match in this chunk. + idx += 16; + continue; + } + + // Find the first matching byte (first non-zero in matched). + *index = idx + pm_ctzll((uint64_t) (~mask & 0xFFFF)); + return true; + } + + // Scalar tail. + while (idx < maximum && source[idx] < 0x80) { + uint8_t byte = source[idx]; + if (parser->strpbrk_cache.table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) { + *index = idx; + return true; + } + idx++; + } + + *index = idx; + return false; +} + +#elif defined(PRISM_HAS_SWAR) + +static PRISM_INLINE bool +scan_strpbrk_ascii(pm_parser_t *parser, const uint8_t *source, size_t maximum, const uint8_t *charset, size_t *index) { + pm_strpbrk_cache_update(parser, charset); + + static const uint64_t highs = 0x8080808080808080ULL; + size_t idx = 0; + + while (idx + 8 <= maximum) { + uint64_t word; + memcpy(&word, source + idx, 8); + + // Bail on any non-ASCII byte. + if (word & highs) break; + + // Check each byte against the charset table. + for (size_t j = 0; j < 8; j++) { + uint8_t byte = source[idx + j]; + if (parser->strpbrk_cache.table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) { + *index = idx + j; + return true; + } + } + + idx += 8; + } + + // Scalar tail. + while (idx < maximum && source[idx] < 0x80) { + uint8_t byte = source[idx]; + if (parser->strpbrk_cache.table[byte >> 6] & ((uint64_t) 1 << (byte & 0x3F))) { + *index = idx; + return true; + } + idx++; + } + + *index = idx; + return false; +} + +#else + +static PRISM_INLINE bool +scan_strpbrk_ascii(PRISM_UNUSED pm_parser_t *parser, PRISM_UNUSED const uint8_t *source, PRISM_UNUSED size_t maximum, PRISM_UNUSED const uint8_t *charset, size_t *index) { + *index = 0; + return false; +} + +#endif + +/** + * This is the default path. + */ +static PRISM_INLINE const uint8_t * +pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) { + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, (uint32_t) ((source + start) - parser->start), (uint32_t) (index - start)); + } + } + } + + return NULL; +} + +/** + * This is the path when the encoding is ASCII-8BIT. + */ +static PRISM_INLINE const uint8_t * +pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) { + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, (uint32_t) (source - parser->start), 1); + index++; + } + + return NULL; +} + +/** + * This is the slow path that does care about the encoding. + */ +static PRISM_INLINE const uint8_t * +pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) { + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + if (validate) pm_strpbrk_explicit_encoding_set(parser, (uint32_t) (source - parser->start), (uint32_t) width); + + if (width > 0) { + index += width; + } else if (!validate) { + index++; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, (uint32_t) ((source + start) - parser->start), (uint32_t) (index - start)); + } + } + } + + return NULL; +} + +/** + * This is the fast path that does not care about the encoding because we know + * the encoding only supports single-byte characters. + */ +static PRISM_INLINE const uint8_t * +pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t index, size_t maximum, bool validate) { + const pm_encoding_t *encoding = parser->encoding; + + while (index < maximum) { + if (strchr((const char *) charset, source[index]) != NULL) { + return source + index; + } + + if (source[index] < 0x80 || !validate) { + index++; + } else { + size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); + pm_strpbrk_explicit_encoding_set(parser, (uint32_t) (source - parser->start), (uint32_t) width); + + if (width > 0) { + index += width; + } else { + // At this point we know we have an invalid multibyte character. + // We'll walk forward as far as we can until we find the next + // valid character so that we don't spam the user with a ton of + // the same kind of error. + const size_t start = index; + + do { + index++; + } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); + + pm_strpbrk_invalid_multibyte_character(parser, (uint32_t) ((source + start) - parser->start), (uint32_t) (index - start)); + } + } + } + + return NULL; +} + +/** + * Here we have rolled our own version of strpbrk. The standard library strpbrk + * has undefined behavior when the source string is not null-terminated. We want + * to support strings that are not null-terminated because pm_parse does not + * have the contract that the string is null-terminated. (This is desirable + * because it means the extension can call pm_parse with the result of a call to + * mmap). + * + * The standard library strpbrk also does not support passing a maximum length + * to search. We want to support this for the reason mentioned above, but we + * also don't want it to stop on null bytes. Ruby actually allows null bytes + * within strings, comments, regular expressions, etc. So we need to be able to + * skip past them. + * + * Finally, we want to support encodings wherein the charset could contain + * characters that are trailing bytes of multi-byte characters. For example, in + * Shift_JIS, the backslash character can be a trailing byte. In that case we + * need to take a slower path and iterate one multi-byte character at a time. + */ +const uint8_t * +pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) { + if (length <= 0) return NULL; + + size_t maximum = (size_t) length; + size_t index = 0; + if (scan_strpbrk_ascii(parser, source, maximum, charset, &index)) return source + index; + + if (!parser->encoding_changed) { + return pm_strpbrk_utf8(parser, source, charset, index, maximum, validate); + } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + return pm_strpbrk_ascii_8bit(parser, source, charset, index, maximum, validate); + } else if (parser->encoding->multibyte) { + return pm_strpbrk_multi_byte(parser, source, charset, index, maximum, validate); + } else { + return pm_strpbrk_single_byte(parser, source, charset, index, maximum, validate); + } +} diff --git a/prism/templates/ext/prism/api_node.c.erb b/prism/templates/ext/prism/api_node.c.erb index 23af8886a7..41d7165930 100644 --- a/prism/templates/ext/prism/api_node.c.erb +++ b/prism/templates/ext/prism/api_node.c.erb @@ -1,5 +1,9 @@ #line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" #include "prism/extension.h" +#include "prism/internal/allocator.h" +#include "prism/internal/arena.h" + +#include <assert.h> extern VALUE rb_cPrism; extern VALUE rb_cPrismNode; @@ -12,25 +16,20 @@ static VALUE rb_cPrism<%= node.name %>; <%- end -%> static VALUE -pm_location_new(const pm_parser_t *parser, const uint8_t *start, const uint8_t *end, VALUE source, bool freeze) { +pm_location_new(const uint32_t start, const uint32_t length, VALUE source, bool freeze) { if (freeze) { - VALUE location_argv[] = { - source, - LONG2FIX(start - parser->start), - LONG2FIX(end - start) - }; - + VALUE location_argv[] = { source, LONG2FIX(start), LONG2FIX(length) }; return rb_obj_freeze(rb_class_new_instance(3, location_argv, rb_cPrismLocation)); } else { - uint64_t value = ((((uint64_t) (start - parser->start)) << 32) | ((uint32_t) (end - start))); + uint64_t value = ((((uint64_t) start) << 32) | ((uint64_t) length)); return ULL2NUM(value); } } VALUE pm_token_new(const pm_parser_t *parser, const pm_token_t *token, rb_encoding *encoding, VALUE source, bool freeze) { - ID type = rb_intern(pm_token_type_name(token->type)); - VALUE location = pm_location_new(parser, token->start, token->end, source, freeze); + ID type = rb_intern(pm_token_type(token->type)); + VALUE location = pm_location_new((uint32_t) (token->start - pm_parser_start(parser)), (uint32_t) (token->end - token->start), source, freeze); VALUE slice = rb_enc_str_new((const char *) token->start, token->end - token->start, encoding); if (freeze) rb_obj_freeze(slice); @@ -79,19 +78,25 @@ pm_integer_new(const pm_integer_t *integer) { // Create a Prism::Source object from the given parser, after pm_parse() was called. VALUE pm_source_new(const pm_parser_t *parser, rb_encoding *encoding, bool freeze) { - VALUE source_string = rb_enc_str_new((const char *) parser->start, parser->end - parser->start, encoding); + const uint8_t *start = pm_parser_start(parser); + VALUE source_string = rb_enc_str_new((const char *) start, pm_parser_end(parser) - start, encoding); - VALUE offsets = rb_ary_new_capa(parser->newline_list.size); - for (size_t index = 0; index < parser->newline_list.size; index++) { - rb_ary_push(offsets, ULONG2NUM(parser->newline_list.offsets[index])); - } + const pm_line_offset_list_t *line_offsets = pm_parser_line_offsets(parser); + VALUE offsets; if (freeze) { + offsets = rb_ary_new_capa(line_offsets->size); + for (size_t index = 0; index < line_offsets->size; index++) { + rb_ary_push(offsets, ULONG2NUM(line_offsets->offsets[index])); + } + rb_obj_freeze(source_string); rb_obj_freeze(offsets); + } else { + offsets = rb_str_new((const char *) line_offsets->offsets, line_offsets->size * sizeof(uint32_t)); } - VALUE source = rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(parser->start_line), offsets); + VALUE source = rb_funcall(rb_cPrismSource, rb_intern("for"), 3, source_string, LONG2NUM(pm_parser_start_line(parser)), offsets); if (freeze) rb_obj_freeze(source); return source; @@ -104,8 +109,8 @@ typedef struct pm_node_stack_node { } pm_node_stack_node_t; static void -pm_node_stack_push(pm_node_stack_node_t **stack, const pm_node_t *visit) { - pm_node_stack_node_t *node = xmalloc(sizeof(pm_node_stack_node_t)); +pm_node_stack_push(pm_arena_t *arena, pm_node_stack_node_t **stack, const pm_node_t *visit) { + pm_node_stack_node_t *node = (pm_node_stack_node_t *) pm_arena_alloc(arena, sizeof(pm_node_stack_node_t), PRISM_ALIGNOF(pm_node_stack_node_t)); node->prev = *stack; node->visit = visit; node->visited = false; @@ -118,32 +123,40 @@ pm_node_stack_pop(pm_node_stack_node_t **stack) { const pm_node_t *visit = current->visit; *stack = current->prev; - xfree(current); return visit; } -VALUE -pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encoding, VALUE source, bool freeze) { - VALUE constants = rb_ary_new_capa(parser->constant_pool.size); - - for (uint32_t index = 0; index < parser->constant_pool.size; index++) { - pm_constant_t *constant = &parser->constant_pool.constants[index]; - int state = 0; +typedef struct { + VALUE constants; + rb_encoding *encoding; +} pm_ast_constants_each_data_t; - VALUE string = rb_enc_str_new((const char *) constant->start, constant->length, encoding); - VALUE value = rb_protect(rb_str_intern, string, &state); +static void +pm_ast_constants_each(const pm_constant_t *constant, void *data) { + pm_ast_constants_each_data_t *constants_data = (pm_ast_constants_each_data_t *) data; + int state = 0; - if (state != 0) { - value = ID2SYM(rb_intern_const("?")); - rb_set_errinfo(Qnil); - } + VALUE string = rb_enc_str_new((const char *) pm_constant_start(constant), pm_constant_length(constant), constants_data->encoding); + VALUE value = rb_protect(rb_str_intern, string, &state); - rb_ary_push(constants, value); + if (state != 0) { + value = ID2SYM(rb_intern_const("?")); + rb_set_errinfo(Qnil); } + rb_ary_push(constants_data->constants, value); +} + +VALUE +pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encoding, VALUE source, bool freeze) { + VALUE constants = rb_ary_new_capa(pm_parser_constants_size(parser)); + pm_ast_constants_each_data_t constants_data = { .constants = constants, .encoding = encoding }; + pm_parser_constants_each(parser, pm_ast_constants_each, &constants_data); + + pm_arena_t *node_arena = pm_arena_new(); pm_node_stack_node_t *node_stack = NULL; - pm_node_stack_push(&node_stack, node); + pm_node_stack_push(node_arena, &node_stack, node); VALUE value_stack = rb_ary_new(); while (node_stack != NULL) { @@ -166,10 +179,10 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi <%- node.fields.each do |field| -%> <%- case field -%> <%- when Prism::Template::NodeField, Prism::Template::OptionalNodeField -%> - pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>); + pm_node_stack_push(node_arena, &node_stack, (pm_node_t *) cast-><%= field.name %>); <%- when Prism::Template::NodeListField -%> for (size_t index = 0; index < cast-><%= field.name %>.size; index++) { - pm_node_stack_push(&node_stack, (pm_node_t *) cast-><%= field.name %>.nodes[index]); + pm_node_stack_push(node_arena, &node_stack, (pm_node_t *) cast-><%= field.name %>.nodes[index]); } <%- end -%> <%- end -%> @@ -200,7 +213,7 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi argv[1] = ULONG2NUM(node->node_id); // location - argv[2] = pm_location_new(parser, node->location.start, node->location.end, source, freeze); + argv[2] = pm_location_new(node->location.start, node->location.length, source, freeze); // flags argv[3] = ULONG2NUM(node->flags); @@ -237,10 +250,10 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi if (freeze) rb_obj_freeze(argv[<%= index %>]); <%- when Prism::Template::LocationField -%> #line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" - argv[<%= index %>] = pm_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source, freeze); + argv[<%= index %>] = pm_location_new(cast-><%= field.name %>.start, cast-><%= field.name %>.length, source, freeze); <%- when Prism::Template::OptionalLocationField -%> #line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" - argv[<%= index %>] = cast-><%= field.name %>.start == NULL ? Qnil : pm_location_new(parser, cast-><%= field.name %>.start, cast-><%= field.name %>.end, source, freeze); + argv[<%= index %>] = cast-><%= field.name %>.length == 0 ? Qnil : pm_location_new(cast-><%= field.name %>.start, cast-><%= field.name %>.length, source, freeze); <%- when Prism::Template::UInt8Field -%> #line <%= __LINE__ + 1 %> "prism/templates/ext/prism/<%= File.basename(__FILE__) %>" argv[<%= index %>] = UINT2NUM(cast-><%= field.name %>); @@ -271,6 +284,7 @@ pm_ast_new(const pm_parser_t *parser, const pm_node_t *node, rb_encoding *encodi } } + pm_arena_free(node_arena); return rb_ary_pop(value_stack); } diff --git a/prism/templates/include/prism/ast.h.erb b/prism/templates/include/prism/ast.h.erb index 751c0b43c2..3b3be25e76 100644 --- a/prism/templates/include/prism/ast.h.erb +++ b/prism/templates/include/prism/ast.h.erb @@ -2,16 +2,20 @@ * @file ast.h * * The abstract syntax tree. + * + * -- */ #ifndef PRISM_AST_H #define PRISM_AST_H -#include "prism/defines.h" -#include "prism/util/pm_constant_pool.h" -#include "prism/util/pm_integer.h" -#include "prism/util/pm_string.h" +#include "prism/compiler/align.h" +#include "prism/compiler/exported.h" + +#include "prism/arena.h" +#include "prism/constant_pool.h" +#include "prism/integer.h" +#include "prism/stringy.h" -#include <assert.h> #include <stddef.h> #include <stdint.h> @@ -20,7 +24,7 @@ */ typedef enum pm_token_type { <%- tokens.each do |token| -%> - /** <%= token.comment %> */ + /** <%= Prism::Template::Doxygen.verbatim(token.comment) %> */ PM_TOKEN_<%= token.name %><%= " = #{token.value}" if token.value %>, <%- end -%> @@ -44,15 +48,28 @@ typedef struct { } pm_token_t; /** - * This represents a range of bytes in the source string to which a node or - * token corresponds. + * Returns a string representation of the given token type. + * + * @param token_type The type of the token to get the string representation of. + * @returns A string representation of the given token type. This is meant for + * debugging purposes and is not guaranteed to be stable across versions. + */ +PRISM_EXPORTED_FUNCTION const char * pm_token_type(pm_token_type_t token_type); + +/** + * This struct represents a slice in the source code, defined by an offset and + * a length. Note that we have confirmation that we can represent all locations + * within Ruby source files using 32-bit integers per: + * + * https://bugs.ruby-lang.org/issues/20488#note-1 + * */ typedef struct { - /** A pointer to the start location of the range in the source. */ - const uint8_t *start; + /** The offset of the location from the start of the source. */ + uint32_t start; - /** A pointer to the end location of the range in the source. */ - const uint8_t *end; + /** The length of the location. */ + uint32_t length; } pm_location_t; struct pm_node; @@ -104,29 +121,13 @@ static const pm_node_flags_t PM_NODE_FLAG_NEWLINE = 0x1; static const pm_node_flags_t PM_NODE_FLAG_STATIC_LITERAL = 0x2; /** - * Cast the type to an enum to allow the compiler to provide exhaustiveness - * checking. - */ -#define PM_NODE_TYPE(node) ((enum pm_node_type) (node)->type) - -/** - * Return true if the type of the given node matches the given type. - */ -#define PM_NODE_TYPE_P(node, type) (PM_NODE_TYPE(node) == (type)) - -/** - * Return true if the given flag is set on the given node. - */ -#define PM_NODE_FLAG_P(node, flag) ((((pm_node_t *)(node))->flags & (flag)) != 0) - -/** * This is the base structure that represents a node in the syntax tree. It is * embedded into every node type. */ typedef struct pm_node { /** * This represents the type of the node. It somewhat maps to the nodes that - * existed in the original grammar and ripper, but it's not a 1:1 mapping. + * existed in the original grammar and ripper, but it is not a 1:1 mapping. */ pm_node_type_t type; @@ -143,11 +144,46 @@ typedef struct pm_node { uint32_t node_id; /** - * This is the location of the node in the source. It's a range of bytes + * This is the location of the node in the source. It is a range of bytes * containing a start and an end. */ pm_location_t location; } pm_node_t; + +/** + * Cast the given node to the base pm_node_t type. + */ +#define PM_NODE_UPCAST(node_) ((pm_node_t *) (node_)) + +/** + * Cast the type to an enum to allow the compiler to provide exhaustiveness + * checking. + */ +#define PM_NODE_TYPE(node_) ((enum pm_node_type) (node_)->type) + +/** + * Return true if the type of the given node matches the given type. + */ +#define PM_NODE_TYPE_P(node_, type_) (PM_NODE_TYPE(node_) == (type_)) + +/** + * Return the flags associated with the given node. + */ +#define PM_NODE_FLAGS(node_) (PM_NODE_UPCAST(node_)->flags) + +/** + * Return true if the given flag is set on the given node. + */ +#define PM_NODE_FLAG_P(node_, flag_) ((PM_NODE_FLAGS(node_) & (flag_)) != 0) + +/** + * The alignment required for a child node within a parent node. + */ +#ifdef _MSC_VER +#define PM_NODE_ALIGNAS __declspec(align(8)) +#else +#define PM_NODE_ALIGNAS PRISM_ALIGNAS(PRISM_ALIGNOF(void *)) +#endif <%- nodes.each do |node| -%> /** @@ -170,7 +206,6 @@ typedef struct pm_node { typedef struct pm_<%= node.human %> { /** The embedded base node. */ pm_node_t base; - <%- node.fields.each do |field| -%> /** @@ -183,7 +218,7 @@ typedef struct pm_<%= node.human %> { <%- end -%> */ <%= case field - when Prism::Template::NodeField, Prism::Template::OptionalNodeField then "struct #{field.c_type} *#{field.name}" + when Prism::Template::NodeField, Prism::Template::OptionalNodeField then "PM_NODE_ALIGNAS struct #{field.c_type} *#{field.name}" when Prism::Template::NodeListField then "struct pm_node_list #{field.name}" when Prism::Template::ConstantField, Prism::Template::OptionalConstantField then "pm_constant_id_t #{field.name}" when Prism::Template::ConstantListField then "pm_constant_id_list_t #{field.name}" @@ -210,8 +245,27 @@ typedef enum pm_<%= flag.human %> { /** <%= value.comment %> */ PM_<%= flag.human.upcase %>_<%= value.name %> = <%= 1 << (index + Prism::Template::COMMON_FLAGS_COUNT) %>, <%- end -%> + + PM_<%= flag.human.upcase %>_LAST, } pm_<%= flag.human %>_t; <%- end -%> +<%- nodes.each do |node| -%> + +<%- params = node.fields.map(&:c_param) -%> +/** + * Allocate and initialize a new <%= node.name %> node. + * + * @param arena The arena to allocate from. + * @param node_id The unique identifier for this node. + * @param flags The flags for this node. + * @param location The location of this node in the source. +<%- node.fields.each do |field| -%> + * @param <%= field.name %> <%= field.comment ? Prism::Template::Doxygen.verbatim(field.comment.lines.first.strip) : "The #{field.name} field." %> +<%- end -%> + * @returns The newly allocated and initialized node. + */ +PRISM_EXPORTED_FUNCTION pm_<%= node.human %>_t * pm_<%= node.human %>_new(pm_arena_t *arena, uint32_t node_id, pm_node_flags_t flags, pm_location_t location<%= params.empty? ? "" : ", #{params.join(", ")}" %>); +<%- end -%> /** * When we're serializing to Java, we want to skip serializing the location diff --git a/prism/templates/include/prism/diagnostic.h.erb b/prism/templates/include/prism/diagnostic.h.erb deleted file mode 100644 index 07bbc8fae7..0000000000 --- a/prism/templates/include/prism/diagnostic.h.erb +++ /dev/null @@ -1,130 +0,0 @@ -/** - * @file diagnostic.h - * - * A list of diagnostics generated during parsing. - */ -#ifndef PRISM_DIAGNOSTIC_H -#define PRISM_DIAGNOSTIC_H - -#include "prism/ast.h" -#include "prism/defines.h" -#include "prism/util/pm_list.h" - -#include <stdbool.h> -#include <stdlib.h> -#include <assert.h> - -/** - * The diagnostic IDs of all of the diagnostics, used to communicate the types - * of errors between the parser and the user. - */ -typedef enum { - // These are the error diagnostics. - <%- errors.each do |error| -%> - PM_ERR_<%= error.name %>, - <%- end -%> - - // These are the warning diagnostics. - <%- warnings.each do |warning| -%> - PM_WARN_<%= warning.name %>, - <%- end -%> -} pm_diagnostic_id_t; - -/** - * This struct represents a diagnostic generated during parsing. - * - * @extends pm_list_node_t - */ -typedef struct { - /** The embedded base node. */ - pm_list_node_t node; - - /** The location of the diagnostic in the source. */ - pm_location_t location; - - /** The ID of the diagnostic. */ - pm_diagnostic_id_t diag_id; - - /** The message associated with the diagnostic. */ - const char *message; - - /** - * Whether or not the memory related to the message of this diagnostic is - * owned by this diagnostic. If it is, it needs to be freed when the - * diagnostic is freed. - */ - bool owned; - - /** - * The level of the diagnostic, see `pm_error_level_t` and - * `pm_warning_level_t` for possible values. - */ - uint8_t level; -} pm_diagnostic_t; - -/** - * The levels of errors generated during parsing. - */ -typedef enum { - /** For errors that should raise a syntax error. */ - PM_ERROR_LEVEL_SYNTAX = 0, - - /** For errors that should raise an argument error. */ - PM_ERROR_LEVEL_ARGUMENT = 1, - - /** For errors that should raise a load error. */ - PM_ERROR_LEVEL_LOAD = 2 -} pm_error_level_t; - -/** - * The levels of warnings generated during parsing. - */ -typedef enum { - /** For warnings which should be emitted if $VERBOSE != nil. */ - PM_WARNING_LEVEL_DEFAULT = 0, - - /** For warnings which should be emitted if $VERBOSE == true. */ - PM_WARNING_LEVEL_VERBOSE = 1 -} pm_warning_level_t; - -/** - * Get the human-readable name of the given diagnostic ID. - * - * @param diag_id The diagnostic ID. - * @return The human-readable name of the diagnostic ID. - */ -const char * pm_diagnostic_id_human(pm_diagnostic_id_t diag_id); - -/** - * Append a diagnostic to the given list of diagnostics that is using shared - * memory for its message. - * - * @param list The list to append to. - * @param start The start of the diagnostic. - * @param end The end of the diagnostic. - * @param diag_id The diagnostic ID. - * @return Whether the diagnostic was successfully appended. - */ -bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id); - -/** - * Append a diagnostic to the given list of diagnostics that is using a format - * string for its message. - * - * @param list The list to append to. - * @param start The start of the diagnostic. - * @param end The end of the diagnostic. - * @param diag_id The diagnostic ID. - * @param ... The arguments to the format string for the message. - * @return Whether the diagnostic was successfully appended. - */ -bool pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id, ...); - -/** - * Deallocate the internal state of the given diagnostic list. - * - * @param list The list to deallocate. - */ -void pm_diagnostic_list_free(pm_list_t *list); - -#endif diff --git a/prism/templates/include/prism/internal/diagnostic.h.erb b/prism/templates/include/prism/internal/diagnostic.h.erb new file mode 100644 index 0000000000..ee44ff5382 --- /dev/null +++ b/prism/templates/include/prism/internal/diagnostic.h.erb @@ -0,0 +1,60 @@ +#ifndef PRISM_INTERNAL_DIAGNOSTIC_H +#define PRISM_INTERNAL_DIAGNOSTIC_H + +#include "prism/internal/list.h" + +#include "prism/arena.h" +#include "prism/diagnostic.h" + +/* + * The diagnostic IDs of all of the diagnostics, used to communicate the types + * of errors between the parser and the user. + */ +typedef enum { + /* These are the error diagnostics. */ + <%- errors.each do |error| -%> + PM_ERR_<%= error.name %>, + <%- end -%> + + /* These are the warning diagnostics. */ + <%- warnings.each do |warning| -%> + PM_WARN_<%= warning.name %>, + <%- end -%> +} pm_diagnostic_id_t; + +/* + * This struct represents a diagnostic generated during parsing. + */ +struct pm_diagnostic_t { + /* The embedded base node. */ + pm_list_node_t node; + + /* The location of the diagnostic in the source. */ + pm_location_t location; + + /* The ID of the diagnostic. */ + pm_diagnostic_id_t diag_id; + + /* The message associated with the diagnostic. */ + const char *message; + + /* + * The level of the diagnostic, see `pm_error_level_t` and + * `pm_warning_level_t` for possible values. + */ + uint8_t level; +}; + +/* + * Append a diagnostic to the given list of diagnostics that is using shared + * memory for its message. + */ +void pm_diagnostic_list_append(pm_arena_t *arena, pm_list_t *list, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id); + +/* + * Append a diagnostic to the given list of diagnostics that is using a format + * string for its message. + */ +void pm_diagnostic_list_append_format(pm_arena_t *arena, pm_list_t *list, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id, ...); + +#endif diff --git a/prism/templates/lib/prism/compiler.rb.erb b/prism/templates/lib/prism/compiler.rb.erb index 45ed88d8de..13317cac04 100644 --- a/prism/templates/lib/prism/compiler.rb.erb +++ b/prism/templates/lib/prism/compiler.rb.erb @@ -1,3 +1,6 @@ +#-- +# rbs_inline: enabled + module Prism # A compiler is a visitor that returns the value of each node as it visits. # This is as opposed to a visitor which will only walk the tree. This can be @@ -18,24 +21,32 @@ module Prism # class Compiler < Visitor # Visit an individual node. - def visit(node) + #-- + #: (node?) -> untyped + def visit(node) # :nodoc: node&.accept(self) end # Visit a list of nodes. - def visit_all(nodes) + #-- + #: (Array[node?]) -> untyped + def visit_all(nodes) # :nodoc: nodes.map { |node| node&.accept(self) } end # Visit the child nodes of the given node. - def visit_child_nodes(node) - node.compact_child_nodes.map { |node| node.accept(self) } + #-- + #: (node) -> Array[untyped] + def visit_child_nodes(node) # :nodoc: + node.each_child_node.map { |node| node.accept(self) } end <%- nodes.each_with_index do |node, index| -%> <%= "\n" if index != 0 -%> - # Compile a <%= node.name %> node - alias visit_<%= node.human %> visit_child_nodes + #: (<%= node.name %>) -> Array[untyped] + def visit_<%= node.human %>(node) # :nodoc: + node.each_child_node.map { |node| node.accept(self) } + end <%- end -%> end end diff --git a/prism/templates/lib/prism/dispatcher.rb.erb b/prism/templates/lib/prism/dispatcher.rb.erb index 52478451c9..5991b0c904 100644 --- a/prism/templates/lib/prism/dispatcher.rb.erb +++ b/prism/templates/lib/prism/dispatcher.rb.erb @@ -1,3 +1,6 @@ +#-- +# rbs_inline: enabled + module Prism # The dispatcher class fires events for nodes that are found while walking an # AST to all registered listeners. It's useful for performing different types @@ -32,50 +35,52 @@ module Prism # dispatcher.dispatch_once(integer) # class Dispatcher < Visitor - # attr_reader listeners: Hash[Symbol, Array[Listener]] - attr_reader :listeners + # A hash mapping event names to arrays of listeners that should be notified + # when that event is fired. + attr_reader :listeners #: Hash[Symbol, Array[untyped]] # Initialize a new dispatcher. + #-- + #: () -> void def initialize @listeners = {} end # Register a listener for one or more events. - # - # def register: (Listener, *Symbol) -> void + #-- + #: (untyped, *Symbol) -> void def register(listener, *events) register_events(listener, events) end # Register all public methods of a listener that match the pattern # `on_<node_name>_(enter|leave)`. - # - # def register_public_methods: (Listener) -> void + #-- + #: (untyped) -> void def register_public_methods(listener) register_events(listener, listener.public_methods(false).grep(/\Aon_.+_(?:enter|leave)\z/)) end # Register a listener for the given events. - private def register_events(listener, events) + #-- + #: (untyped, Array[Symbol]) -> void + private def register_events(listener, events) # :nodoc: events.each { |event| (listeners[event] ||= []) << listener } end # Walks `root` dispatching events to all registered listeners. - # - # def dispatch: (Node) -> void alias dispatch visit # Dispatches a single event for `node` to all registered listeners. - # - # def dispatch_once: (Node) -> void + #-- + #: (node node) -> void def dispatch_once(node) node.accept(DispatchOnce.new(listeners)) end <%- nodes.each do |node| -%> - # Dispatch enter and leave events for <%= node.name %> nodes and continue - # walking the tree. - def visit_<%= node.human %>(node) + #: (<%= node.name %> node) -> void + def visit_<%= node.human %>(node) # :nodoc: listeners[:on_<%= node.human %>_enter]&.each { |listener| listener.on_<%= node.human %>_enter(node) } super listeners[:on_<%= node.human %>_leave]&.each { |listener| listener.on_<%= node.human %>_leave(node) } @@ -83,14 +88,17 @@ module Prism <%- end -%> class DispatchOnce < Visitor # :nodoc: - attr_reader :listeners + attr_reader :listeners #: Hash[Symbol, Array[untyped]] + #: (Hash[Symbol, Array[untyped]] listeners) -> void def initialize(listeners) @listeners = listeners end <%- nodes.each do |node| -%> # Dispatch enter and leave events for <%= node.name %> nodes. + #-- + #: (<%= node.name %> node) -> void def visit_<%= node.human %>(node) listeners[:on_<%= node.human %>_enter]&.each { |listener| listener.on_<%= node.human %>_enter(node) } listeners[:on_<%= node.human %>_leave]&.each { |listener| listener.on_<%= node.human %>_leave(node) } diff --git a/prism/templates/lib/prism/dot_visitor.rb.erb b/prism/templates/lib/prism/dot_visitor.rb.erb index e9c81e4545..88ef1e1f36 100644 --- a/prism/templates/lib/prism/dot_visitor.rb.erb +++ b/prism/templates/lib/prism/dot_visitor.rb.erb @@ -1,18 +1,26 @@ -require "cgi" +#-- +# rbs_inline: enabled + +require "cgi/escape" +require "cgi/util" unless defined?(CGI::EscapeExt) module Prism # This visitor provides the ability to call Node#to_dot, which converts a # subtree into a graphviz dot graph. class DotVisitor < Visitor class Field # :nodoc: - attr_reader :name, :value, :port + attr_reader :name #: String + attr_reader :value #: String? + attr_reader :port #: bool + #: (String name, String? value, bool port) -> void def initialize(name, value, port) @name = name @value = value @port = port end + #: () -> String def to_dot if port "<tr><td align=\"left\" colspan=\"2\" port=\"#{name}\">#{name}</td></tr>" @@ -23,17 +31,21 @@ module Prism end class Table # :nodoc: - attr_reader :name, :fields + attr_reader :name #: String + attr_reader :fields #: Array[Field] + #: (String name) -> void def initialize(name) @name = name @fields = [] end + #: (String name, ?String? value, ?port: bool) -> void def field(name, value = nil, port: false) fields << Field.new(name, value, port) end + #: () -> String def to_dot dot = <<~DOT <table border="0" cellborder="1" cellspacing="0" cellpadding="4"> @@ -49,26 +61,31 @@ module Prism end class Digraph # :nodoc: - attr_reader :nodes, :waypoints, :edges + attr_reader :nodes, :waypoints, :edges #: Array[String] + #: () -> void def initialize @nodes = [] @waypoints = [] @edges = [] end + #: (String value) -> void def node(value) nodes << value end + #: (String value) -> void def waypoint(value) waypoints << value end + #: (String value) -> void def edge(value) edges << value end + #: () -> String def to_dot <<~DOT digraph "Prism" { @@ -92,21 +109,25 @@ module Prism private_constant :Field, :Table, :Digraph # The digraph that is being built. - attr_reader :digraph + attr_reader :digraph #: Digraph # Initialize a new dot visitor. + #-- + #: () -> void def initialize @digraph = Digraph.new end # Convert this visitor into a graphviz dot graph string. + #-- + #: () -> String def to_dot digraph.to_dot end <%- nodes.each do |node| -%> - # Visit a <%= node.name %> node. - def visit_<%= node.human %>(node) + #: (<%= node.name %>) -> void + def visit_<%= node.human %>(node) # :nodoc: table = Table.new("<%= node.name %>") id = node_id(node) <%- if (node_flags = node.flags) -%> @@ -151,7 +172,7 @@ module Prism <%- end -%> <%- end -%> - digraph.nodes << <<~DOT + digraph.node(<<~DOT) #{id} [ label=<#{table.to_dot.gsub(/\n/, "\n ")}> ]; @@ -164,19 +185,25 @@ module Prism private # Generate a unique node ID for a node throughout the digraph. - def node_id(node) + #-- + #: (node) -> String + def node_id(node) # :nodoc: "Node_#{node.object_id}" end - # Inspect a location to display the start and end line and column numbers. - def location_inspect(location) + # Inspect a location to display the start and end line and columns in bytes. + #-- + #: (Location) -> String + def location_inspect(location) # :nodoc: "(#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column})" end <%- flags.each do |flag| -%> # Inspect a node that has <%= flag.human %> flags to display the flags as a # comma-separated list. - def <%= flag.human %>_inspect(node) + #-- + #: (<%= nodes.filter_map { |node| node.name if node.flags == flag }.join(" | ") %> node) -> String + def <%= flag.human %>_inspect(node) # :nodoc: flags = [] #: Array[String] <%- flag.values.each do |value| -%> flags << "<%= value.name.downcase %>" if node.<%= value.name.downcase %>? diff --git a/prism/templates/lib/prism/dsl.rb.erb b/prism/templates/lib/prism/dsl.rb.erb index e16ebb7110..be7dc6d9c1 100644 --- a/prism/templates/lib/prism/dsl.rb.erb +++ b/prism/templates/lib/prism/dsl.rb.erb @@ -1,8 +1,11 @@ +#-- +# rbs_inline: enabled + module Prism # The DSL module provides a set of methods that can be used to create prism # nodes in a more concise manner. For example, instead of writing: # - # source = Prism::Source.for("[1]") + # source = Prism::Source.for("[1]", 1, [0]) # # Prism::ArrayNode.new( # source, @@ -56,17 +59,31 @@ module Prism extend self # Create a new Source object. + #-- + #: (String string) -> Source def source(string) - Source.for(string) + Source.for(string, 1, build_offsets(string)) end # Create a new Location object. + #-- + #: (?source: Source, ?start_offset: Integer, ?length: Integer) -> Location def location(source: default_source, start_offset: 0, length: 0) Location.new(source, start_offset, length) end <%- nodes.each do |node| -%> + <%- + params = [ + ["source", "Source"], + ["node_id", "Integer"], + ["location", "Location"], + ["flags", "Integer"] + ].concat(node.fields.map { |field| [field.name, field.rbs_class] }) + -%> # Create a new <%= node.name %> node. + #-- + #: (<%= params.map { |(name, type)| "?#{name}: #{type}" }.join(", ") %>) -> <%= node.name %> def <%= node.human %>(<%= ["source: default_source", "node_id: 0", "location: default_location", "flags: 0", *node.fields.map { |field| case field when Prism::Template::NodeField @@ -100,6 +117,8 @@ module Prism <%- flags.each do |flag| -%> # Retrieve the value of one of the <%= flag.name %> flags. + #-- + #: (Symbol name) -> Integer def <%= flag.human.chomp("s") %>(name) case name <%- flag.values.each do |value| -%> @@ -114,20 +133,40 @@ module Prism # The default source object that gets attached to nodes and locations if no # source is specified. + #-- + #: () -> Source def default_source - Source.for("") + Source.for("", 1, [0]) end # The default location object that gets attached to nodes if no location is # specified, which uses the given source. + #-- + #: () -> Location def default_location Location.new(default_source, 0, 0) end # The default node that gets attached to nodes if no node is specified for a # required node field. + #-- + #: (Source source, Location location) -> node def default_node(source, location) - MissingNode.new(source, -1, location, 0) + ErrorRecoveryNode.new(source, -1, location, 0, nil) + end + + private + + # Build the newline byte offset array for the given source string. + #-- + #: (String source) -> Array[Integer] + def build_offsets(source) + offsets = [0] + start = 0 + while (index = source.byteindex("\n", start)) + offsets << (start = index + 1) + end + offsets end end end diff --git a/prism/templates/lib/prism/inspect_visitor.rb.erb b/prism/templates/lib/prism/inspect_visitor.rb.erb index 3cfe615d85..820f5ae75f 100644 --- a/prism/templates/lib/prism/inspect_visitor.rb.erb +++ b/prism/templates/lib/prism/inspect_visitor.rb.erb @@ -1,3 +1,6 @@ +#-- +# rbs_inline: enabled + module Prism # This visitor is responsible for composing the strings that get returned by # the various #inspect methods defined on each of the nodes. @@ -7,8 +10,9 @@ module Prism # when we hit an element in that list. In this case, we have a special # command that replaces the subsequent indent with the given value. class Replace # :nodoc: - attr_reader :value + attr_reader :value #: String + #: (String value) -> void def initialize(value) @value = value end @@ -17,19 +21,25 @@ module Prism private_constant :Replace # The current prefix string. - attr_reader :indent + # :stopdoc: + attr_reader :indent #: String + # :startdoc: # The list of commands that we need to execute in order to compose the # final string. - attr_reader :commands + #: stopdoc: + attr_reader :commands #: Array[[String | node | Replace, String]] + # :startdoc: - # Initializes a new instance of the InspectVisitor. - def initialize(indent = +"") + #: (?String indent) -> void + def initialize(indent = +"") # :nodoc: @indent = indent @commands = [] end # Compose an inspect string for the given node. + #-- + #: (node node) -> String def self.compose(node) visitor = new node.accept(visitor) @@ -37,7 +47,9 @@ module Prism end # Compose the final string. - def compose + #-- + #: () -> String + def compose # :nodoc: buffer = +"" replace = nil @@ -66,8 +78,8 @@ module Prism end <%- nodes.each do |node| -%> - # Inspect a <%= node.name %> node. - def visit_<%= node.human %>(node) + #: (<%= node.name %> node) -> void + def visit_<%= node.human %>(node) # :nodoc: commands << [inspect_node(<%= node.name.inspect %>, node), indent] <%- (fields = [node.flags || Prism::Template::Flags.empty, *node.fields]).each_with_index do |field, index| -%> <%- pointer = index == fields.length - 1 ? "└── " : "├── " -%> @@ -114,13 +126,17 @@ module Prism private # Compose a header for the given node. - def inspect_node(name, node) + #-- + #: (String name, node node) -> String + def inspect_node(name, node) # :nodoc: location = node.location "@ #{name} (location: (#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column}))\n" end # Compose a string representing the given inner location field. - def inspect_location(location) + #-- + #: (Location? location) -> String + def inspect_location(location) # :nodoc: if location "(#{location.start_line},#{location.start_column})-(#{location.end_line},#{location.end_column}) = #{location.slice.inspect}" else diff --git a/prism/templates/lib/prism/mutation_compiler.rb.erb b/prism/templates/lib/prism/mutation_compiler.rb.erb index 565ee4e315..2d555048d2 100644 --- a/prism/templates/lib/prism/mutation_compiler.rb.erb +++ b/prism/templates/lib/prism/mutation_compiler.rb.erb @@ -1,3 +1,6 @@ +#-- +# rbs_inline: enabled + module Prism # This visitor walks through the tree and copies each node as it is being # visited. This is useful for consumers that want to mutate the tree, as you @@ -5,8 +8,8 @@ module Prism class MutationCompiler < Compiler <%- nodes.each_with_index do |node, index| -%> <%= "\n" if index != 0 -%> - # Copy a <%= node.name %> node - def visit_<%= node.human %>(node) + #: (<%= node.name %>) -> node? + def visit_<%= node.human %>(node) # :nodoc: <%- fields = node.fields.select { |field| [Prism::Template::NodeField, Prism::Template::OptionalNodeField, Prism::Template::NodeListField].include?(field.class) } -%> <%- if fields.any? -%> node.copy(<%= fields.map { |field| "#{field.name}: #{field.is_a?(Prism::Template::NodeListField) ? "visit_all" : "visit"}(node.#{field.name})" }.join(", ") %>) diff --git a/prism/templates/lib/prism/node.rb.erb b/prism/templates/lib/prism/node.rb.erb index ceee2b0ffe..fb13051aba 100644 --- a/prism/templates/lib/prism/node.rb.erb +++ b/prism/templates/lib/prism/node.rb.erb @@ -1,24 +1,49 @@ +#-- +# rbs_inline: enabled + module Prism + # @rbs! + # interface _Repository + # def enter: (Integer node_id, Symbol field_name) -> Relocation::Entry + # end + # + # interface _Node + # def deconstruct: () -> Array[Prism::node?] + # def inspect: () -> String + # end + # + # type node = Node & _Node + # This represents a node in the tree. It is the parent class of all of the # various node types. class Node # A pointer to the source that this node was created from. - attr_reader :source + # :stopdoc: + attr_reader :source #: Source private :source + # :startdoc: # A unique identifier for this node. This is used in a very specific # use case where you want to keep around a reference to a node without # having to keep around the syntax tree in memory. This unique identifier # will be consistent across multiple parses of the same source code. - attr_reader :node_id + attr_reader :node_id #: Integer + + # The location associated with this node. For lazily loading Location + # objects, we keep it as a packed integer until it is accessed. + # @rbs @location: Location | Integer # Save this node using a saved source so that it can be retrieved later. + #-- + #: (_Repository repository) -> Relocation::Entry def save(repository) repository.enter(node_id, :itself) end # A Location instance that represents the location of this node in the # source. + #-- + #: () -> Location def location location = @location return location if location.is_a?(Location) @@ -26,104 +51,151 @@ module Prism end # Save the location using a saved source so that it can be retrieved later. + #-- + #: (_Repository repository) -> Relocation::Entry def save_location(repository) repository.enter(node_id, :location) end - # Delegates to the start_line of the associated location object. + # -------------------------------------------------------------------------- + # :section: Location Delegators + # These methods provide convenient access to the underlying Location object. + # -------------------------------------------------------------------------- + + # Delegates to [`start_line`](rdoc-ref:Location#start_line) of the associated location object. + #-- + #: () -> Integer def start_line location.start_line end - # Delegates to the end_line of the associated location object. + # Delegates to [`end_line`](rdoc-ref:Location#end_line) of the associated location object. + #-- + #: () -> Integer def end_line location.end_line end - # The start offset of the node in the source. This method is effectively a - # delegate method to the location object. + # Delegates to [`start_offset`](rdoc-ref:Location#start_offset) of the associated location object. + #-- + #: () -> Integer def start_offset location = @location location.is_a?(Location) ? location.start_offset : location >> 32 end - # The end offset of the node in the source. This method is effectively a - # delegate method to the location object. + # Delegates to [`end_offset`](rdoc-ref:Location#end_offset) of the associated location object. + #-- + #: () -> Integer def end_offset location = @location location.is_a?(Location) ? location.end_offset : ((location >> 32) + (location & 0xFFFFFFFF)) end - # Delegates to the start_character_offset of the associated location object. + # Delegates to [`start_character_offset`](rdoc-ref:Location#start_character_offset) + # of the associated location object. + #-- + #: () -> Integer def start_character_offset location.start_character_offset end - # Delegates to the end_character_offset of the associated location object. + # Delegates to [`end_character_offset`](rdoc-ref:Location#end_character_offset) + # of the associated location object. + #-- + #: () -> Integer def end_character_offset location.end_character_offset end - # Delegates to the cached_start_code_units_offset of the associated location - # object. + # Delegates to [`cached_start_code_units_offset`](rdoc-ref:Location#cached_start_code_units_offset) + # of the associated location object. + #-- + #: (_CodeUnitsCache cache) -> Integer def cached_start_code_units_offset(cache) location.cached_start_code_units_offset(cache) end - # Delegates to the cached_end_code_units_offset of the associated location - # object. + # Delegates to [`cached_end_code_units_offset`](rdoc-ref:Location#cached_end_code_units_offset) + # of the associated location object. + #-- + #: (_CodeUnitsCache cache) -> Integer def cached_end_code_units_offset(cache) location.cached_end_code_units_offset(cache) end - # Delegates to the start_column of the associated location object. + # Delegates to [`start_column`](rdoc-ref:Location#start_column) of the associated location object. + #-- + #: () -> Integer def start_column location.start_column end - # Delegates to the end_column of the associated location object. + # Delegates to [`end_column`](rdoc-ref:Location#end_column) of the associated location object. + #-- + #: () -> Integer def end_column location.end_column end - # Delegates to the start_character_column of the associated location object. + # Delegates to [`start_character_column`](rdoc-ref:Location#start_character_column) + # of the associated location object. + #-- + #: () -> Integer def start_character_column location.start_character_column end - # Delegates to the end_character_column of the associated location object. + # Delegates to [`end_character_column`](rdoc-ref:Location#end_character_column) + # of the associated location object. + #-- + #: () -> Integer def end_character_column location.end_character_column end - # Delegates to the cached_start_code_units_column of the associated location - # object. + # Delegates to [`cached_start_code_units_column`](rdoc-ref:Location#cached_start_code_units_column) + # of the associated location object. + #-- + #: (_CodeUnitsCache cache) -> Integer def cached_start_code_units_column(cache) location.cached_start_code_units_column(cache) end - # Delegates to the cached_end_code_units_column of the associated location - # object. + # Delegates to [`cached_end_code_units_column`](rdoc-ref:Location#cached_end_code_units_column) + # of the associated location object. + #-- + #: (_CodeUnitsCache cache) -> Integer def cached_end_code_units_column(cache) location.cached_end_code_units_column(cache) end - # Delegates to the leading_comments of the associated location object. + # Delegates to [`leading_comments`](rdoc-ref:Location#leading_comments) of the associated location object. + #-- + #: () -> Array[Comment] def leading_comments location.leading_comments end - # Delegates to the trailing_comments of the associated location object. + # Delegates to [`trailing_comments`](rdoc-ref:Location#trailing_comments) of the associated location object. + #-- + #: () -> Array[Comment] def trailing_comments location.trailing_comments end - # Delegates to the comments of the associated location object. + # Delegates to [`comments`](rdoc-ref:Location#comments) of the associated location object. + #-- + #: () -> Array[Comment] def comments location.comments end + # :section: + # Returns all of the lines of the source code associated with this node. + #-- + #: () -> Array[String] def source_lines location.source_lines end @@ -133,6 +205,8 @@ module Prism alias script_lines source_lines # Slice the location of the node from the source. + #-- + #: () -> String def slice location.slice end @@ -140,28 +214,38 @@ module Prism # Slice the location of the node from the source, starting at the beginning # of the line that the location starts on, ending at the end of the line # that the location ends on. + #-- + #: () -> String def slice_lines location.slice_lines end # An bitset of flags for this node. There are certain flags that are common # for all nodes, and then some nodes have specific flags. - attr_reader :flags + # :stopdoc: + attr_reader :flags #: Integer protected :flags + # :startdoc: # Returns true if the node has the newline flag set. + #-- + #: () -> bool def newline? flags.anybits?(NodeFlags::NEWLINE) end # Returns true if the node has the static literal flag set. + #-- + #: () -> bool def static_literal? flags.anybits?(NodeFlags::STATIC_LITERAL) end # Similar to inspect, but respects the current level of indentation given by # the pretty print object. - def pretty_print(q) + #-- + #: (PP q) -> void + def pretty_print(q) # :nodoc: q.seplist(inspect.chomp.each_line, -> { q.breakable }) do |line| q.text(line.chomp) end @@ -169,6 +253,8 @@ module Prism end # Convert this node into a graphviz dot graph string. + #-- + #: () -> String def to_dot # @type self: node DotVisitor.new.tap { |visitor| accept(visitor) }.to_dot @@ -180,28 +266,18 @@ module Prism # # Important to note is that the column given to this method should be in # bytes, as opposed to characters or code units. + #-- + #: (Integer line, Integer column) -> Array[node] def tunnel(line, column) - queue = [self] #: Array[Prism::node] - result = [] #: Array[Prism::node] + queue = [self] #: Array[node] + result = [] #: Array[node] + offset = source.byte_offset(line, column) while (node = queue.shift) result << node - node.compact_child_nodes.each do |child_node| - child_location = child_node.location - - start_line = child_location.start_line - end_line = child_location.end_line - - if start_line == end_line - if line == start_line && column >= child_location.start_column && column < child_location.end_column - queue << child_node - break - end - elsif (line == start_line && column >= child_location.start_column) || (line == end_line && column < child_location.end_column) - queue << child_node - break - elsif line > start_line && line < end_line + node.each_child_node do |child_node| + if child_node.start_offset <= offset && offset < child_node.end_offset queue << child_node break end @@ -212,13 +288,14 @@ module Prism end # Returns the first node that matches the given block when visited in a - # depth-first search. This is useful for finding a node that matches a + # breadth-first search. This is useful for finding a node that matches a # particular condition. # # node.breadth_first_search { |node| node.node_id == node_id } - # - def breadth_first_search(&block) - queue = [self] #: Array[Prism::node] + #-- + #: () { (node) -> bool } -> node? + def breadth_first_search(&blk) + queue = [self] #: Array[node] while (node = queue.shift) return node if yield node @@ -227,10 +304,33 @@ module Prism nil end + alias find breadth_first_search + + # Returns all of the nodes that match the given block when visited in a + # breadth-first search. This is useful for finding all nodes that match a + # particular condition. + # + # node.breadth_first_search_all { |node| node.is_a?(Prism::CallNode) } + #-- + #: () { (node) -> bool } -> Array[node] + def breadth_first_search_all(&blk) + queue = [self] #: Array[Prism::node] + results = [] #: Array[Prism::node] + + while (node = queue.shift) + results << node if yield node + queue.concat(node.compact_child_nodes) + end + + results + end + alias find_all breadth_first_search_all # Returns a list of the fields that exist for this node class. Fields # describe the structure of the node. This kind of reflection is useful for # things like recursively visiting each node _and_ field in the tree. + #-- + #: () -> Array[Reflection::Field] def self.fields # This method should only be called on subclasses of Node, not Node # itself. @@ -240,38 +340,57 @@ module Prism end # -------------------------------------------------------------------------- - # :section: Node interface - # These methods are effectively abstract methods that must be implemented by - # the various subclasses of Node. They are here to make it easier to work - # with typecheckers. + # :section: Node Interface + # These methods are effectively abstract methods that are implemented by + # the various subclasses of Node. # -------------------------------------------------------------------------- # Accepts a visitor and calls back into the specialized visit function. + #-- + #: (_Visitor visitor) -> untyped def accept(visitor) raise NoMethodError, "undefined method `accept' for #{inspect}" end # Returns an array of child nodes, including `nil`s in the place of optional # nodes that were not present. + #-- + #: () -> Array[node?] def child_nodes raise NoMethodError, "undefined method `child_nodes' for #{inspect}" end alias deconstruct child_nodes + # With a block given, yields each child node. Without a block, returns + # an enumerator that contains each child node. Excludes any `nil`s in + # the place of optional nodes that were not present. + #-- + #: () -> Enumerator[node, void] + #: () { (node) -> void } -> void + def each_child_node(&blk) + raise NoMethodError, "undefined method `each_child_node' for #{inspect}" + end + # Returns an array of child nodes, excluding any `nil`s in the place of # optional nodes that were not present. + #-- + #: () -> Array[node] def compact_child_nodes raise NoMethodError, "undefined method `compact_child_nodes' for #{inspect}" end # Returns an array of child nodes and locations that could potentially have # comments attached to them. + #-- + #: () -> Array[node | Location] def comment_targets raise NoMethodError, "undefined method `comment_targets' for #{inspect}" end # Returns a string representation of the node. + #-- + #: () -> String def inspect raise NoMethodError, "undefined method `inspect' for #{inspect}" end @@ -288,6 +407,8 @@ module Prism # it uses a single integer comparison, but also because if you're on CRuby # you can take advantage of the fact that case statements with all symbol # keys will use a jump table. + #-- + #: () -> Symbol def type raise NoMethodError, "undefined method `type' for #{inspect}" end @@ -296,6 +417,8 @@ module Prism # splitting on the type of the node without having to do a long === chain. # Note that like #type, it will still be slower than using == for a single # class, but should be faster in a case statement or an array comparison. + #-- + #: () -> Symbol def self.type raise NoMethodError, "undefined method `type' for #{inspect}" end @@ -306,7 +429,13 @@ module Prism #<%= line %> <%- end -%> class <%= node.name -%> < Node + <%- node.fields.each do |field| -%> + # @rbs @<%= field.name %>: <%= field.rbs_class %> + <%- end -%> + # Initialize a new <%= node.name %> node. + #-- + #: (Source source, Integer node_id, Location location, Integer flags, <%= node.fields.map { |field| "#{field.rbs_class} #{field.name}" }.join(", ") %>) -> void def initialize(<%= ["source", "node_id", "location", "flags", *node.fields.map(&:name)].join(", ") %>) @source = source @node_id = node_id @@ -320,12 +449,27 @@ module Prism <%- end -%> end - # def accept: (Visitor visitor) -> void + # --------- + # :section: Repository + # Methods related to Relocation. + # --------- + + # ---------------------------------------------------------------------------------- + # :section: Node Interface + # These methods are present on all subclasses of Node. + # Read the [node interface docs](Node.html#node-interface) for more information. + # ---------------------------------------------------------------------------------- + + # See Node.accept. + #-- + #: (_Visitor visitor) -> untyped def accept(visitor) visitor.visit_<%= node.human %>(self) end - # def child_nodes: () -> Array[Node?] + # See Node.child_nodes. + #-- + #: () -> Array[node?] def child_nodes [<%= node.fields.map { |field| case field @@ -335,7 +479,28 @@ module Prism }.compact.join(", ") %>] end - # def compact_child_nodes: () -> Array[Node] + # See Node.each_child_node. + #-- + #: () -> Enumerator[node, void] + #: () { (node) -> void } -> void + def each_child_node(&blk) + return to_enum(:each_child_node) unless block_given? + + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + yield <%= field.name %> + <%- when Prism::Template::OptionalNodeField -%> + if (<%= field.name %> = self.<%= field.name %>); yield <%= field.name %>; end + <%- when Prism::Template::NodeListField -%> + <%= field.name %>.each { |node| yield node } + <%- end -%> + <%- end -%> + end + + # See Node.compact_child_nodes. + #-- + #: () -> Array[node] def compact_child_nodes <%- if node.fields.any? { |field| field.is_a?(Prism::Template::OptionalNodeField) } -%> compact = [] #: Array[Prism::node] @@ -344,7 +509,7 @@ module Prism <%- when Prism::Template::NodeField -%> compact << <%= field.name %> <%- when Prism::Template::OptionalNodeField -%> - compact << <%= field.name %> if <%= field.name %> + if (<%= field.name %> = self.<%= field.name %>); compact << <%= field.name %>; end <%- when Prism::Template::NodeListField -%> compact.concat(<%= field.name %>) <%- end -%> @@ -360,7 +525,9 @@ module Prism <%- end -%> end - # def comment_targets: () -> Array[Node | Location] + # See Node.comment_targets. + #-- + #: () -> Array[node | Location] def comment_targets [<%= node.fields.map { |field| case field @@ -370,50 +537,101 @@ module Prism }.compact.join(", ") %>] #: Array[Prism::node | Location] end - # def copy: (<%= (["?node_id: Integer", "?location: Location", "?flags: Integer"] + node.fields.map { |field| "?#{field.name}: #{field.rbs_class}" }).join(", ") %>) -> <%= node.name %> + # :call-seq: + # copy(**fields) -> <%= node.name %> + # + # Creates a copy of self with the given fields, using self as the template. + #-- + #: (?node_id: Integer, ?location: Location, ?flags: Integer, <%= node.fields.map { |field| "?#{field.name}: #{field.rbs_class}" }.join(", ") %>) -> <%= node.name %> def copy(<%= (["node_id", "location", "flags"] + node.fields.map(&:name)).map { |field| "#{field}: self.#{field}" }.join(", ") %>) <%= node.name %>.new(<%= ["source", "node_id", "location", "flags", *node.fields.map(&:name)].join(", ") %>) end - # def deconstruct: () -> Array[Node?] alias deconstruct child_nodes - # def deconstruct_keys: (Array[Symbol] keys) -> { <%= (["node_id: Integer", "location: Location"] + node.fields.map { |field| "#{field.name}: #{field.rbs_class}" }).join(", ") %> } - def deconstruct_keys(keys) + #: (Array[Symbol]? keys) -> Hash[Symbol, untyped] + def deconstruct_keys(keys) # :nodoc: { <%= (["node_id: node_id", "location: location"] + node.fields.map { |field| "#{field.name}: #{field.name}" }).join(", ") %> } end + + # See `Node#type`. + #-- + #: () -> :<%= node.human %> + def type + :<%= node.human %> + end + + # See `Node.type`. + #-- + #: () -> :<%= node.human %> + def self.type + :<%= node.human %> + end + + #: () -> String + def inspect # :nodoc: + InspectVisitor.compose(self) + end + + # :section: + <%- if (node_flags = node.flags) -%> <%- node_flags.values.each do |value| -%> - - # def <%= value.name.downcase %>?: () -> bool + # :category: Flags + # <%= value.comment %> + #-- + #: () -> bool def <%= value.name.downcase %>? flags.anybits?(<%= node_flags.name %>::<%= value.name %>) end + <%- end -%> <%- end -%> <%- node.fields.each do |field| -%> - + <%- case field -%> + <%- when Prism::Template::LocationField -%> + # :category: Locations + # :call-seq: + # <%= field.name %> -> <%= field.call_seq_type %> + # <%- if field.comment.nil? -%> - # attr_reader <%= field.name %>: <%= field.rbs_class %> + # Returns the Location represented by `<%= field.name %>`. <%- else -%> <%- field.each_comment_line do |line| -%> #<%= line %> <%- end -%> <%- end -%> - <%- case field -%> - <%- when Prism::Template::LocationField -%> + #-- + #: () -> Location def <%= field.name %> location = @<%= field.name %> return location if location.is_a?(Location) @<%= field.name %> = Location.new(source, location >> 32, location & 0xFFFFFFFF) end + # :category: Repository # Save the <%= field.name %> location using the given saved source so that # it can be retrieved later. + #-- + #: (_Repository repository) -> Relocation::Entry def save_<%= field.name %>(repository) repository.enter(node_id, :<%= field.name %>) end + <%- when Prism::Template::OptionalLocationField -%> + # :category: Locations + # :call-seq: + # <%= field.name %> -> <%= field.call_seq_type %> + # + <%- if field.comment.nil? -%> + # Returns the Location represented by `<%= field.name %>`. + <%- else -%> + <%- field.each_comment_line do |line| -%> + #<%= line %> + <%- end -%> + <%- end -%> + #-- + #: () -> Location? def <%= field.name %> location = @<%= field.name %> case location @@ -426,54 +644,69 @@ module Prism end end + # :category: Repository # Save the <%= field.name %> location using the given saved source so that # it can be retrieved later. + #-- + #: (_Repository repository) -> Relocation::Entry? def save_<%= field.name %>(repository) repository.enter(node_id, :<%= field.name %>) unless @<%= field.name %>.nil? end <%- else -%> - attr_reader :<%= field.name %> + # :call-seq: + # <%= field.name %> -> <%= field.call_seq_type %> + # + <%- if field.comment.nil? -%> + # Returns the `<%= field.name %>` attribute. + <%- else -%> + <%- field.each_comment_line do |line| -%> + #<%= line %> <%- end -%> <%- end -%> + #-- + #: () -> <%= field.rbs_class %> + def <%= field.name %> + @<%= field.name %> + end + + <%- end -%> + <%- end -%> + # :section: Slicing + <%- node.fields.each do |field| -%> <%- case field -%> <%- when Prism::Template::LocationField -%> <%- raise unless field.name.end_with?("_loc") -%> <%- next if node.fields.any? { |other| other.name == field.name.delete_suffix("_loc") } -%> - - # def <%= field.name.delete_suffix("_loc") %>: () -> String + # :call-seq: + # <%= field.name.delete_suffix("_loc") %> -> String + # + # Slice the location of <%= field.name %> from the source. + #-- + #: () -> String def <%= field.name.delete_suffix("_loc") %> <%= field.name %>.slice end + <%- when Prism::Template::OptionalLocationField -%> <%- raise unless field.name.end_with?("_loc") -%> <%- next if node.fields.any? { |other| other.name == field.name.delete_suffix("_loc") } -%> - - # def <%= field.name.delete_suffix("_loc") %>: () -> String? + # :call-seq: + # <%= field.name.delete_suffix("_loc") %> -> String | nil + # + # Slice the location of <%= field.name %> from the source. + #-- + #: () -> String? def <%= field.name.delete_suffix("_loc") %> <%= field.name %>&.slice end + <%- end -%> <%- end -%> + # :section: - # def inspect -> String - def inspect - InspectVisitor.compose(self) - end - - # Return a symbol representation of this node type. See `Node#type`. - def type - :<%= node.human %> - end - - # Return a symbol representation of this node type. See `Node::type`. - def self.type - :<%= node.human %> - end - - # Implements case-equality for the node. This is effectively == but without - # comparing the value of locations. Locations are checked only for presence. - def ===(other) + #: (untyped other) -> boolish + def ===(other) # :nodoc: other.is_a?(<%= node.name %>)<%= " &&" if (fields = [*node.flags, *node.fields]).any? %> <%- fields.each_with_index do |field, index| -%> <%- if field.is_a?(Prism::Template::LocationField) || field.is_a?(Prism::Template::OptionalLocationField) -%> diff --git a/prism/templates/lib/prism/reflection.rb.erb b/prism/templates/lib/prism/reflection.rb.erb index 6c8b2f4d25..0012f120b2 100644 --- a/prism/templates/lib/prism/reflection.rb.erb +++ b/prism/templates/lib/prism/reflection.rb.erb @@ -1,3 +1,6 @@ +#-- +# rbs_inline: enabled + module Prism # The Reflection module provides the ability to reflect on the structure of # the syntax tree itself, as opposed to looking at a single syntax tree. This @@ -7,9 +10,11 @@ module Prism # for all other field types. class Field # The name of the field. - attr_reader :name + attr_reader :name #: Symbol # Initializes the field with the given name. + #-- + #: (Symbol name) -> void def initialize(name) @name = name end @@ -83,9 +88,11 @@ module Prism # the bitset should be accessed through their query methods. class FlagsField < Field # The names of the flags in the bitset. - attr_reader :flags + attr_reader :flags #: Array[Symbol] # Initializes the flags field with the given name and flags. + #-- + #: (Symbol name, Array[Symbol] flags) -> void def initialize(name, flags) super(name) @flags = flags @@ -93,6 +100,8 @@ module Prism end # Returns the fields for the given node. + #-- + #: (singleton(Node) node) -> Array[Field] def self.fields_for(node) case node.type <%- nodes.each do |node| -%> diff --git a/prism/templates/lib/prism/serialize.rb.erb b/prism/templates/lib/prism/serialize.rb.erb index 104b60f484..a676f957af 100644 --- a/prism/templates/lib/prism/serialize.rb.erb +++ b/prism/templates/lib/prism/serialize.rb.erb @@ -1,16 +1,19 @@ +#-- +# rbs_inline: enabled + require "stringio" require_relative "polyfill/unpack1" module Prism # A module responsible for deserializing parse results. - module Serialize + module Serialize # :nodoc: # The major version of prism that we are expecting to find in the serialized # strings. MAJOR_VERSION = 1 # The minor version of prism that we are expecting to find in the serialized # strings. - MINOR_VERSION = 4 + MINOR_VERSION = 9 # The patch version of prism that we are expecting to find in the serialized # strings. @@ -20,9 +23,11 @@ module Prism # # The formatting of the source of this method is purposeful to illustrate # the structure of the serialized data. + #-- + #: (String input, String serialized, bool freeze) -> ParseResult def self.load_parse(input, serialized, freeze) input = input.dup - source = Source.for(input) + source = Source.for(input, 1, []) loader = Loader.new(source, serialized) loader.load_header @@ -38,16 +43,17 @@ module Prism data_loc = loader.load_optional_location_object(freeze) errors = loader.load_errors(encoding, freeze) warnings = loader.load_warnings(encoding, freeze) + continuable = loader.load_bool cpool_base = loader.load_uint32 cpool_size = loader.load_varuint - constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size) + constant_pool = ConstantPool.new(serialized, cpool_base, cpool_size) - node = loader.load_node(constant_pool, encoding, freeze) + node = loader.load_node(constant_pool, encoding, freeze) #: ProgramNode loader.load_constant_pool(constant_pool) raise unless loader.eof? - result = ParseResult.new(node, comments, magic_comments, data_loc, errors, warnings, source) + result = ParseResult.new(node, comments, magic_comments, data_loc, errors, warnings, continuable, source) result.freeze if freeze input.force_encoding(encoding) @@ -73,8 +79,10 @@ module Prism # # The formatting of the source of this method is purposeful to illustrate # the structure of the serialized data. + #-- + #: (String input, String serialized, bool freeze) -> LexResult def self.load_lex(input, serialized, freeze) - source = Source.for(input) + source = Source.for(input, 1, []) loader = Loader.new(source, serialized) tokens = loader.load_tokens @@ -90,9 +98,10 @@ module Prism data_loc = loader.load_optional_location_object(freeze) errors = loader.load_errors(encoding, freeze) warnings = loader.load_warnings(encoding, freeze) + continuable = loader.load_bool raise unless loader.eof? - result = LexResult.new(tokens, comments, magic_comments, data_loc, errors, warnings, source) + result = LexResult.new(tokens, comments, magic_comments, data_loc, errors, warnings, continuable, source) tokens.each do |token| token[0].value.force_encoding(encoding) @@ -117,8 +126,10 @@ module Prism # # The formatting of the source of this method is purposeful to illustrate # the structure of the serialized data. + #-- + #: (String input, String serialized, bool freeze) -> Array[Comment] def self.load_parse_comments(input, serialized, freeze) - source = Source.for(input) + source = Source.for(input, 1, []) loader = Loader.new(source, serialized) loader.load_header @@ -139,8 +150,10 @@ module Prism # # The formatting of the source of this method is purposeful to illustrate # the structure of the serialized data. + #-- + #: (String input, String serialized, bool freeze) -> ParseLexResult def self.load_parse_lex(input, serialized, freeze) - source = Source.for(input) + source = Source.for(input, 1, []) loader = Loader.new(source, serialized) tokens = loader.load_tokens @@ -157,17 +170,18 @@ module Prism data_loc = loader.load_optional_location_object(freeze) errors = loader.load_errors(encoding, freeze) warnings = loader.load_warnings(encoding, freeze) + continuable = loader.load_bool cpool_base = loader.load_uint32 cpool_size = loader.load_varuint - constant_pool = ConstantPool.new(input, serialized, cpool_base, cpool_size) + constant_pool = ConstantPool.new(serialized, cpool_base, cpool_size) - node = loader.load_node(constant_pool, encoding, freeze) + node = loader.load_node(constant_pool, encoding, freeze) #: ProgramNode loader.load_constant_pool(constant_pool) raise unless loader.eof? - value = [node, tokens] - result = ParseLexResult.new(value, comments, magic_comments, data_loc, errors, warnings, source) + value = [node, tokens] #: [ProgramNode, Array[[Token, Integer]]] + result = ParseLexResult.new(value, comments, magic_comments, data_loc, errors, warnings, continuable, source) tokens.each do |token| token[0].value.force_encoding(encoding) @@ -189,34 +203,36 @@ module Prism end class ConstantPool # :nodoc: - attr_reader :size + attr_reader :size #: Integer + + # @rbs @serialized: String + # @rbs @base: Integer + # @rbs @pool: Array[Symbol?] - def initialize(input, serialized, base, size) - @input = input + #: (String serialized, Integer base, Integer size) -> void + def initialize(serialized, base, size) @serialized = serialized @base = base @size = size @pool = Array.new(size, nil) end + #: (Integer index, Encoding encoding) -> Symbol def get(index, encoding) @pool[index] ||= begin offset = @base + index * 8 - start = @serialized.unpack1("L", offset: offset) - length = @serialized.unpack1("L", offset: offset + 4) + start = @serialized.unpack1("L", offset: offset) #: Integer + length = @serialized.unpack1("L", offset: offset + 4) #: Integer - if start.nobits?(1 << 31) - @input.byteslice(start, length).force_encoding(encoding).to_sym - else - @serialized.byteslice(start & ((1 << 31) - 1), length).force_encoding(encoding).to_sym - end + (@serialized.byteslice(start, length) or raise).force_encoding(encoding).to_sym end end end if RUBY_ENGINE == "truffleruby" # StringIO is synchronized and that adds a high overhead on TruffleRuby. + # @rbs skip class FastStringIO # :nodoc: attr_accessor :pos @@ -246,8 +262,11 @@ module Prism end class Loader # :nodoc: - attr_reader :input, :io, :source + attr_reader :input #: String + attr_reader :io #: StringIO + attr_reader :source #: Source + #: (Source source, String serialized) -> void def initialize(source, serialized) @input = source.source.dup raise unless serialized.encoding == Encoding::BINARY @@ -256,40 +275,46 @@ module Prism define_load_node_lambdas if RUBY_ENGINE != "ruby" end + #: () -> bool def eof? io.getbyte io.eof? end + #: (ConstantPool constant_pool) -> void def load_constant_pool(constant_pool) trailer = 0 constant_pool.size.times do |index| - start, length = io.read(8).unpack("L2") - trailer += length if start.anybits?(1 << 31) + length = (io.read(8) or raise).unpack1("L", offset: 4) #: Integer + trailer += length end io.read(trailer) end + #: () -> void def load_header raise "Invalid serialization" if io.read(5) != "PRISM" - raise "Invalid serialization" if io.read(3).unpack("C3") != [MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION] + raise "Invalid serialization" if (io.read(3) or raise).unpack("C3") != [MAJOR_VERSION, MINOR_VERSION, PATCH_VERSION] raise "Invalid serialization (location fields must be included but are not)" if io.getbyte != 0 end + #: () -> Encoding def load_encoding - encoding = Encoding.find(io.read(load_varuint)) + encoding = Encoding.find((io.read(load_varuint) or raise)) or raise @input = input.force_encoding(encoding).freeze encoding end + #: (bool freeze) -> Array[Integer] def load_line_offsets(freeze) offsets = Array.new(load_varuint) { load_varuint } offsets.freeze if freeze offsets end + #: (bool freeze) -> Array[Comment] def load_comments(freeze) comments = Array.new(load_varuint) do @@ -297,6 +322,7 @@ module Prism case load_varuint when 0 then InlineComment.new(load_location_object(freeze)) when 1 then EmbDocComment.new(load_location_object(freeze)) + else raise end comment.freeze if freeze @@ -307,6 +333,7 @@ module Prism comments end + #: (bool freeze) -> Array[MagicComment] def load_magic_comments(freeze) magic_comments = Array.new(load_varuint) do @@ -331,10 +358,11 @@ module Prism <%- warnings.each do |warning| -%> <%= warning.name.downcase.to_sym.inspect %>, <%- end -%> - ].freeze + ].freeze #: Array[Symbol] private_constant :DIAGNOSTIC_TYPES + #: () -> Symbol def load_error_level level = io.getbyte @@ -350,13 +378,14 @@ module Prism end end + #: (Encoding encoding, bool freeze) -> Array[ParseError] def load_errors(encoding, freeze) errors = Array.new(load_varuint) do error = ParseError.new( DIAGNOSTIC_TYPES.fetch(load_varuint), - load_embedded_string(encoding), + load_string(encoding), load_location_object(freeze), load_error_level ) @@ -369,6 +398,7 @@ module Prism errors end + #: () -> Symbol def load_warning_level level = io.getbyte @@ -382,13 +412,14 @@ module Prism end end + #: (Encoding encoding, bool freeze) -> Array[ParseWarning] def load_warnings(encoding, freeze) warnings = Array.new(load_varuint) do warning = ParseWarning.new( DIAGNOSTIC_TYPES.fetch(load_varuint), - load_embedded_string(encoding), + load_string(encoding), load_location_object(freeze), load_warning_level ) @@ -401,15 +432,15 @@ module Prism warnings end + #: () -> Array[[Token, Integer]] def load_tokens - tokens = [] + tokens = [] #: Array[[Token, Integer]] while (type = TOKEN_TYPES.fetch(load_varuint)) - start = load_varuint - length = load_varuint + location = load_location_object(false) + lex_state = load_varuint - location = Location.new(@source, start, length) token = Token.new(@source, type, location.slice, location) tokens << [token, lex_state] @@ -420,25 +451,29 @@ module Prism # variable-length integer using https://en.wikipedia.org/wiki/LEB128 # This is also what protobuf uses: https://protobuf.dev/programming-guides/encoding/#varints + #-- + #: () -> Integer def load_varuint - n = io.getbyte + n = (io.getbyte or raise) if n < 128 n else n -= 128 shift = 0 - while (b = io.getbyte) >= 128 + while (b = (io.getbyte or raise)) >= 128 n += (b - 128) << (shift += 7) end n + (b << (shift + 7)) end end + #: () -> Integer def load_varsint n = load_varuint (n >> 1) ^ (-(n & 1)) end + #: () -> Integer def load_integer negative = io.getbyte != 0 length = load_varuint @@ -450,14 +485,22 @@ module Prism value end + #: () -> Float def load_double - io.read(8).unpack1("D") + (io.read(8) or raise).unpack1("D") #: Float end + #: () -> bool + def load_bool + (io.getbyte or raise) != 0 + end + + #: () -> Integer def load_uint32 - io.read(4).unpack1("L") + (io.read(4) or raise).unpack1("L") #: Integer end + #: (ConstantPool constant_pool, Encoding encoding, bool freeze) -> node? def load_optional_node(constant_pool, encoding, freeze) if io.getbyte != 0 io.pos -= 1 @@ -465,90 +508,121 @@ module Prism end end - def load_embedded_string(encoding) - io.read(load_varuint).force_encoding(encoding).freeze - end - + #: (Encoding encoding) -> String def load_string(encoding) - case (type = io.getbyte) - when 1 - input.byteslice(load_varuint, load_varuint).force_encoding(encoding).freeze - when 2 - load_embedded_string(encoding) - else - raise "Unknown serialized string type: #{type}" - end + (io.read(load_varuint) or raise).force_encoding(encoding).freeze end + #: (bool freeze) -> Location def load_location_object(freeze) location = Location.new(source, load_varuint, load_varuint) location.freeze if freeze location end + # Load a location object from the serialized data. Note that we are lying + # about the signature a bit here, because we sometimes load it as a packed + # integer instead of an object. + #-- + #: (bool freeze) -> Location def load_location(freeze) return load_location_object(freeze) if freeze - (load_varuint << 32) | load_varuint + (load_varuint << 32) | load_varuint #: Location end + # Load an optional location object from the serialized data if it is + # present. Note that we are lying about the signature a bit here, because + # we sometimes load it as a packed integer instead of an object. + #-- + #: (bool freeze) -> Location? def load_optional_location(freeze) load_location(freeze) if io.getbyte != 0 end + #: (bool freeze) -> Location? def load_optional_location_object(freeze) load_location_object(freeze) if io.getbyte != 0 end + #: (ConstantPool constant_pool, Encoding encoding) -> Symbol def load_constant(constant_pool, encoding) index = load_varuint constant_pool.get(index - 1, encoding) end + #: (ConstantPool constant_pool, Encoding encoding) -> Symbol? def load_optional_constant(constant_pool, encoding) index = load_varuint constant_pool.get(index - 1, encoding) if index != 0 end if RUBY_ENGINE == "ruby" + #: (ConstantPool constant_pool, Encoding encoding, bool freeze) -> node def load_node(constant_pool, encoding, freeze) type = io.getbyte node_id = load_varuint - location = load_location(freeze) - value = case type - <%- nodes.each_with_index do |node, index| -%> - when <%= index + 1 %> then - <%- if node.needs_serialized_length? -%> - load_uint32 - <%- end -%> - <%= node.name %>.new(<%= ["source", "node_id", "location", "load_varuint", *node.fields.map { |field| - case field - when Prism::Template::NodeField then "load_node(constant_pool, encoding, freeze)" - when Prism::Template::OptionalNodeField then "load_optional_node(constant_pool, encoding, freeze)" - when Prism::Template::StringField then "load_string(encoding)" - when Prism::Template::NodeListField then "Array.new(load_varuint) { load_node(constant_pool, encoding, freeze) }.tap { |nodes| nodes.freeze if freeze }" - when Prism::Template::ConstantField then "load_constant(constant_pool, encoding)" - when Prism::Template::OptionalConstantField then "load_optional_constant(constant_pool, encoding)" - when Prism::Template::ConstantListField then "Array.new(load_varuint) { load_constant(constant_pool, encoding) }.tap { |constants| constants.freeze if freeze }" - when Prism::Template::LocationField then "load_location(freeze)" - when Prism::Template::OptionalLocationField then "load_optional_location(freeze)" - when Prism::Template::UInt8Field then "io.getbyte" - when Prism::Template::UInt32Field then "load_varuint" - when Prism::Template::IntegerField then "load_integer" - when Prism::Template::DoubleField then "load_double" - else raise - end - }].join(", ") -%>) + location = load_location(freeze) #: Location + value = + case type + <%- nodes.each_with_index do |node, index| -%> + when <%= index + 1 %> + <%- if node.needs_serialized_length? -%> + load_uint32 + <%- end -%> + <%= node.name %>.new( + source, + node_id, + location, + load_varuint, + <%- node.fields.each do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + load_node(constant_pool, encoding, freeze), #: <%= field.rbs_class %> + <%- when Prism::Template::OptionalNodeField -%> + load_optional_node(constant_pool, encoding, freeze), #: <%= field.rbs_class %> + <%- when Prism::Template::StringField -%> + load_string(encoding), + <%- when Prism::Template::NodeListField -%> + Array.new(load_varuint) do + load_node(constant_pool, encoding, freeze) #: <%= field.element_rbs_class %> + end.tap { |nodes| nodes.freeze if freeze }, + <%- when Prism::Template::ConstantField -%> + load_constant(constant_pool, encoding), + <%- when Prism::Template::OptionalConstantField -%> + load_optional_constant(constant_pool, encoding), + <%- when Prism::Template::ConstantListField -%> + Array.new(load_varuint) { load_constant(constant_pool, encoding) }.tap { |constants| constants.freeze if freeze }, + <%- when Prism::Template::LocationField -%> + load_location(freeze), + <%- when Prism::Template::OptionalLocationField -%> + load_optional_location(freeze), + <%- when Prism::Template::UInt8Field -%> + (io.getbyte or raise), + <%- when Prism::Template::UInt32Field -%> + load_varuint, + <%- when Prism::Template::IntegerField -%> + load_integer, + <%- when Prism::Template::DoubleField -%> + load_double, + <%- else raise -%> + <%- end -%> + <%- end -%> + ) <%- end -%> - end + else + raise "Unknown node type: #{type}" + end value.freeze if freeze value end else + # @rbs skip def load_node(constant_pool, encoding, freeze) - @load_node_lambdas[io.getbyte].call(constant_pool, encoding, freeze) + @load_node_lambdas[(io.getbyte or raise)].call(constant_pool, encoding, freeze) end + # @rbs skip def define_load_node_lambdas @load_node_lambdas = [ nil, @@ -559,24 +633,46 @@ module Prism <%- if node.needs_serialized_length? -%> load_uint32 <%- end -%> - value = <%= node.name %>.new(<%= ["source", "node_id", "location", "load_varuint", *node.fields.map { |field| - case field - when Prism::Template::NodeField then "load_node(constant_pool, encoding, freeze)" - when Prism::Template::OptionalNodeField then "load_optional_node(constant_pool, encoding, freeze)" - when Prism::Template::StringField then "load_string(encoding)" - when Prism::Template::NodeListField then "Array.new(load_varuint) { load_node(constant_pool, encoding, freeze) }" - when Prism::Template::ConstantField then "load_constant(constant_pool, encoding)" - when Prism::Template::OptionalConstantField then "load_optional_constant(constant_pool, encoding)" - when Prism::Template::ConstantListField then "Array.new(load_varuint) { load_constant(constant_pool, encoding) }" - when Prism::Template::LocationField then "load_location(freeze)" - when Prism::Template::OptionalLocationField then "load_optional_location(freeze)" - when Prism::Template::UInt8Field then "io.getbyte" - when Prism::Template::UInt32Field then "load_varuint" - when Prism::Template::IntegerField then "load_integer" - when Prism::Template::DoubleField then "load_double" - else raise - end - }].join(", ") -%>) + value = + <%= node.name %>.new( + source, + node_id, + location, + load_varuint, + <%- node.fields.map do |field| -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + load_node(constant_pool, encoding, freeze), #: <%= field.rbs_class %> + <%- when Prism::Template::OptionalNodeField -%> + load_optional_node(constant_pool, encoding, freeze), #: <%= field.rbs_class %> + <%- when Prism::Template::StringField -%> + load_string(encoding), + <%- when Prism::Template::NodeListField -%> + Array.new(load_varuint) do + load_node(constant_pool, encoding, freeze) #: <%= field.element_rbs_class %> + end, + <%- when Prism::Template::ConstantField -%> + load_constant(constant_pool, encoding), + <%- when Prism::Template::OptionalConstantField -%> + load_optional_constant(constant_pool, encoding), + <%- when Prism::Template::ConstantListField -%> + Array.new(load_varuint) { load_constant(constant_pool, encoding) }, + <%- when Prism::Template::LocationField -%> + load_location(freeze), + <%- when Prism::Template::OptionalLocationField -%> + load_optional_location(freeze), + <%- when Prism::Template::UInt8Field -%> + (io.getbyte or raise), + <%- when Prism::Template::UInt32Field -%> + load_varuint, + <%- when Prism::Template::IntegerField -%> + load_integer, + <%- when Prism::Template::DoubleField -%> + load_double, + <%- else raise -%> + <%- end -%> + <%- end -%> + ) value.freeze if freeze value }, @@ -584,6 +680,10 @@ module Prism ] end end + + # @rbs! + # @load_node_lambdas: Array[Proc] + # def define_load_node_lambdas: () -> void end # The token types that can be indexed by their enum values. @@ -592,7 +692,7 @@ module Prism <%- tokens.each do |token| -%> <%= token.name.to_sym.inspect %>, <%- end -%> - ].freeze + ].freeze #: Array[Symbol?] private_constant :MAJOR_VERSION, :MINOR_VERSION, :PATCH_VERSION private_constant :ConstantPool, :FastStringIO, :Loader, :TOKEN_TYPES diff --git a/prism/templates/lib/prism/visitor.rb.erb b/prism/templates/lib/prism/visitor.rb.erb index 4b30a1815b..f23e87d99e 100644 --- a/prism/templates/lib/prism/visitor.rb.erb +++ b/prism/templates/lib/prism/visitor.rb.erb @@ -1,4 +1,14 @@ +#-- +# rbs_inline: enabled + module Prism + # @rbs! + # interface _Visitor + # <% nodes.each do |node| %> + # def visit_<%= node.human %>: (<%= node.name %>) -> void + # <% end %> + # end + # A class that knows how to walk down the tree. None of the individual visit # methods are implemented on this visitor, so it forces the consumer to # implement each one that they need. For a default implementation that @@ -6,21 +16,27 @@ module Prism class BasicVisitor # Calls `accept` on the given node if it is not `nil`, which in turn should # call back into this visitor by calling the appropriate `visit_*` method. + #-- + #: (node? node) -> void def visit(node) # @type self: _Visitor node&.accept(self) end # Visits each node in `nodes` by calling `accept` on each one. + #-- + #: (Array[node?] nodes) -> void def visit_all(nodes) # @type self: _Visitor nodes.each { |node| node&.accept(self) } end # Visits the child nodes of `node` by calling `accept` on each one. + #-- + #: (node node) -> void def visit_child_nodes(node) # @type self: _Visitor - node.compact_child_nodes.each { |node| node.accept(self) } + node.each_child_node { |node| node.accept(self) } end end @@ -34,7 +50,7 @@ module Prism # # class FooCalls < Prism::Visitor # def visit_call_node(node) - # if node.name == "foo" + # if node.name == :foo # # Do something with the node # end # @@ -47,7 +63,11 @@ module Prism <%- nodes.each_with_index do |node, index| -%> <%= "\n" if index != 0 -%> # Visit a <%= node.name %> node - alias visit_<%= node.human %> visit_child_nodes + #-- + #: (<%= node.name %> node) -> void + def visit_<%= node.human %>(node) + node.each_child_node { |node| node.accept(self) } + end <%- end -%> end end diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index ce98dc5acd..0dea732869 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -1,4 +1,16 @@ -#include "prism/diagnostic.h" +#include "prism/internal/diagnostic.h" + +#include "prism/compiler/inline.h" + +#include "prism/internal/allocator.h" +#include "prism/internal/arena.h" +#include "prism/internal/list.h" + +#include <assert.h> +#include <inttypes.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> #define PM_DIAGNOSTIC_ID_MAX <%= errors.length + warnings.length %> @@ -75,16 +87,16 @@ typedef struct { * * `PM_WARNING_LEVEL_VERBOSE` - Warnings that appear with `-w`, as in `ruby -w -c -e 'code'`. */ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { - // Special error that can be replaced + /* Special error that can be replaced */ [PM_ERR_CANNOT_PARSE_EXPRESSION] = { "cannot parse the expression", PM_ERROR_LEVEL_SYNTAX }, - // Errors that should raise argument errors + /* Errors that should raise argument errors */ [PM_ERR_INVALID_ENCODING_MAGIC_COMMENT] = { "unknown or invalid encoding in the magic comment", PM_ERROR_LEVEL_ARGUMENT }, - // Errors that should raise load errors + /* Errors that should raise load errors */ [PM_ERR_SCRIPT_NOT_FOUND] = { "no Ruby script found in input", PM_ERROR_LEVEL_LOAD }, - // Errors that should raise syntax errors + /* Errors that should raise syntax errors */ [PM_ERR_ALIAS_ARGUMENT] = { "invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ALIAS_ARGUMENT_NUMBERED_REFERENCE] = { "invalid argument being passed to `alias`; can't make alias for the number variables", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_AMPAMPEQ_MULTI_ASSIGN] = { "unexpected `&&=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, @@ -102,6 +114,8 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_ARGUMENT_FORWARDING_UNBOUND] = { "unexpected `...` in an non-parenthesized call", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ARGUMENT_NO_FORWARDING_AMPERSAND] = { "unexpected `&`; no anonymous block parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES] = { "unexpected ... when the parent method is not forwarding", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES_LAMBDA] = { "unexpected ... in lambda argument", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ARGUMENT_NO_FORWARDING_ELLIPSES_BLOCK] = { "unexpected ... in block argument", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ARGUMENT_NO_FORWARDING_STAR] = { "unexpected `*`; no anonymous rest parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ARGUMENT_NO_FORWARDING_STAR_STAR] = { "unexpected `**`; no anonymous keyword rest parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ARGUMENT_SPLAT_AFTER_ASSOC_SPLAT] = { "unexpected `*` splat argument after a `**` keyword splat argument", PM_ERROR_LEVEL_SYNTAX }, @@ -144,7 +158,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_CONDITIONAL_WHILE_PREDICATE] = { "expected a predicate expression for the `while` statement", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_CONSTANT_PATH_COLON_COLON_CONSTANT] = { "expected a constant after the `::` operator", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_DEF_ENDLESS] = { "could not parse the endless method body", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_ENDLESS_PARAMETERS] = { "could not parse the endless method parameters", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_DEF_ENDLESS_SETTER] = { "invalid method name; a setter method cannot be defined in an endless method definition", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_DEF_ENDLESS_DO_BLOCK] = { "unexpected `do` for block in an endless method definition", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_DEF_NAME] = { "unexpected %s; expected a method name", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_DEF_PARAMS_TERM] = { "expected a delimiter to close the parameters", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_DEF_PARAMS_TERM_PAREN] = { "unexpected %s; expected a `)` to close the parameters", PM_ERROR_LEVEL_SYNTAX }, @@ -184,6 +200,8 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_EXPECT_FOR_DELIMITER] = { "unexpected %s; expected a 'do', newline, or ';' after the 'for' loop collection", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_EXPECT_IDENT_REQ_PARAMETER] = { "expected an identifier for the required parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_EXPECT_IN_DELIMITER] = { "expected a delimiter after the patterns of an `in` clause", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_LPAREN_AFTER_NOT_LPAREN] = { "expected a `(` immediately after `not`", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_EXPECT_LPAREN_AFTER_NOT_OTHER] = { "expected a `(` after `not`", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_EXPECT_LPAREN_REQ_PARAMETER] = { "expected a `(` to start a required parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_EXPECT_MESSAGE] = { "unexpected %s; expecting a message to send to the receiver", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_EXPECT_RBRACKET] = { "expected a matching `]`", PM_ERROR_LEVEL_SYNTAX }, @@ -298,6 +316,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_PARAMETER_UNEXPECTED_NO_KW] = { "unexpected **nil; no keywords marker disallowed after keywords", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PATTERN_ARRAY_MULTIPLE_RESTS] = { "unexpected multiple '*' rest patterns in an array pattern", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PATTERN_CAPTURE_DUPLICATE] = { "duplicated variable name", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_PATTERN_CAPTURE_IN_ALTERNATIVE] = { "variable capture in alternative pattern", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PATTERN_EXPRESSION_AFTER_BRACKET] = { "expected a pattern expression after the `[` operator", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PATTERN_EXPRESSION_AFTER_COMMA] = { "expected a pattern expression after `,`", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PATTERN_EXPRESSION_AFTER_HROCKET] = { "expected a pattern expression after `=>`", PM_ERROR_LEVEL_SYNTAX }, @@ -323,13 +342,15 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_ESCAPED_NON_ASCII_IN_UTF8] = { "escaped non ASCII character in UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, - [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_INVALID_CHAR_PROPERTY] = { "invalid character property name {%.*s}: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_PARSE_ERROR] = { "%s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_UNKNOWN_OPTIONS] = { "unknown regexp %s - %.*s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_REGEXP_TERM] = { "unterminated regexp meets end of file; expected a closing delimiter", PM_ERROR_LEVEL_SYNTAX }, - [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%.*s/", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_MODIFIER_VALUE] = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_RESCUE_TERM] = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_SYNTAX }, @@ -344,7 +365,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_STRING_INTERPOLATED_TERM] = { "unterminated string; expected a closing delimiter for the interpolated string", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_STRING_LITERAL_EOF] = { "unterminated string meets end of file", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_STRING_LITERAL_TERM] = { "unexpected %s, expected a string literal terminator", PM_ERROR_LEVEL_SYNTAX }, - [PM_ERR_SYMBOL_INVALID] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX }, // TODO expected symbol? prism.c ~9719 + [PM_ERR_SYMBOL_INVALID] = { "invalid symbol", PM_ERROR_LEVEL_SYNTAX }, /* TODO expected symbol? prism.c ~9719 */ [PM_ERR_SYMBOL_TERM_DYNAMIC] = { "unterminated quoted string; expected a closing delimiter for the dynamic symbol", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_SYMBOL_TERM_INTERPOLATED] = { "unterminated symbol; expected a closing delimiter for the interpolated symbol", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_TERNARY_COLON] = { "expected a `:` after the true expression of a ternary operator", PM_ERROR_LEVEL_SYNTAX }, @@ -358,6 +379,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_UNEXPECTED_INDEX_KEYWORDS] = { "unexpected keyword arg given in index assignment; keywords are not allowed in index assignment expressions", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_UNEXPECTED_LABEL] = { "unexpected label", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_UNEXPECTED_MULTI_WRITE] = { "unexpected multiple assignment; multiple assignment is not allowed in this context", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_UNEXPECTED_PARAMETER_DEFAULT_VALUE] = { "unexpected %s; expected a default value for a parameter", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_UNEXPECTED_RANGE_OPERATOR] = { "unexpected range operator; .. and ... are non-associative and cannot be chained", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_UNEXPECTED_SAFE_NAVIGATION] = { "&. inside multiple assignment destination", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_UNEXPECTED_TOKEN_CLOSE_CONTEXT] = { "unexpected %s, assuming it is closing the parent %s", PM_ERROR_LEVEL_SYNTAX }, @@ -370,7 +392,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_WRITE_TARGET_UNEXPECTED] = { "unexpected write target", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_XSTRING_TERM] = { "expected a closing delimiter for the `%x` or backtick string", PM_ERROR_LEVEL_SYNTAX }, - // Warnings + /* Warnings */ [PM_WARN_AMBIGUOUS_BINARY_OPERATOR] = { "'%s' after local variable or literal is interpreted as binary operator even though it seems like %s", PM_WARNING_LEVEL_VERBOSE }, [PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_MINUS] = { "ambiguous first argument; put parentheses or a space even after `-` operator", PM_WARNING_LEVEL_VERBOSE }, [PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS] = { "ambiguous first argument; put parentheses or a space even after `+` operator", PM_WARNING_LEVEL_VERBOSE }, @@ -406,8 +428,8 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { /** * Get the human-readable name of the given diagnostic ID. */ -const char * -pm_diagnostic_id_human(pm_diagnostic_id_t diag_id) { +static const char * +pm_diagnostic_id_name(pm_diagnostic_id_t diag_id) { switch (diag_id) { <%- errors.each do |error| -%> case PM_ERR_<%= error.name %>: return "<%= error.name.downcase %>"; @@ -421,8 +443,8 @@ pm_diagnostic_id_human(pm_diagnostic_id_t diag_id) { return ""; } -static inline const char * -pm_diagnostic_message(pm_diagnostic_id_t diag_id) { +static PRISM_INLINE const char * +pm_diagnostic_id_message(pm_diagnostic_id_t diag_id) { assert(diag_id < PM_DIAGNOSTIC_ID_MAX); const char *message = diagnostic_messages[diag_id].message; @@ -431,91 +453,102 @@ pm_diagnostic_message(pm_diagnostic_id_t diag_id) { return message; } -static inline uint8_t -pm_diagnostic_level(pm_diagnostic_id_t diag_id) { +static PRISM_INLINE uint8_t +pm_diagnostic_id_level(pm_diagnostic_id_t diag_id) { assert(diag_id < PM_DIAGNOSTIC_ID_MAX); return (uint8_t) diagnostic_messages[diag_id].level; } /** + * Get the type of the given diagnostic. + */ +const char * +pm_diagnostic_type(const pm_diagnostic_t *diagnostic) { + return pm_diagnostic_id_name(diagnostic->diag_id); +} + +/** + * Get the location of the given diagnostic. + */ +pm_location_t +pm_diagnostic_location(const pm_diagnostic_t *diagnostic) { + return diagnostic->location; +} + +/** + * Get the message of the given diagnostic. + */ +const char * +pm_diagnostic_message(const pm_diagnostic_t *diagnostic) { + return diagnostic->message; +} + +/** + * Get the error level associated with the given diagnostic. + */ +pm_error_level_t +pm_diagnostic_error_level(const pm_diagnostic_t *diagnostic) { + return (pm_error_level_t) pm_diagnostic_id_level(diagnostic->diag_id); +} + +/** + * Get the warning level associated with the given diagnostic. + */ +pm_warning_level_t +pm_diagnostic_warning_level(const pm_diagnostic_t *diagnostic) { + return (pm_warning_level_t) pm_diagnostic_id_level(diagnostic->diag_id); +} + +/** * Append an error to the given list of diagnostic. */ -bool -pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) { - pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) xcalloc(1, sizeof(pm_diagnostic_t)); - if (diagnostic == NULL) return false; +void +pm_diagnostic_list_append(pm_arena_t *arena, pm_list_t *list, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id) { + pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) pm_arena_zalloc(arena, sizeof(pm_diagnostic_t), PRISM_ALIGNOF(pm_diagnostic_t)); *diagnostic = (pm_diagnostic_t) { - .location = { start, end }, + .location = { .start = start, .length = length }, .diag_id = diag_id, - .message = pm_diagnostic_message(diag_id), - .owned = false, - .level = pm_diagnostic_level(diag_id) + .message = pm_diagnostic_id_message(diag_id), + .level = pm_diagnostic_id_level(diag_id) }; pm_list_append(list, (pm_list_node_t *) diagnostic); - return true; } /** * Append a diagnostic to the given list of diagnostics that is using a format * string for its message. */ -bool -pm_diagnostic_list_append_format(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id, ...) { +void +pm_diagnostic_list_append_format(pm_arena_t *arena, pm_list_t *list, uint32_t start, uint32_t length, pm_diagnostic_id_t diag_id, ...) { va_list arguments; va_start(arguments, diag_id); - const char *format = pm_diagnostic_message(diag_id); + const char *format = pm_diagnostic_id_message(diag_id); int result = vsnprintf(NULL, 0, format, arguments); va_end(arguments); if (result < 0) { - return false; + return; } - pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) xcalloc(1, sizeof(pm_diagnostic_t)); - if (diagnostic == NULL) { - return false; - } + pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) pm_arena_zalloc(arena, sizeof(pm_diagnostic_t), PRISM_ALIGNOF(pm_diagnostic_t)); - size_t length = (size_t) (result + 1); - char *message = (char *) xmalloc(length); - if (message == NULL) { - xfree(diagnostic); - return false; - } + size_t message_length = (size_t) (result + 1); + char *message = (char *) pm_arena_alloc(arena, message_length, 1); va_start(arguments, diag_id); - vsnprintf(message, length, format, arguments); + vsnprintf(message, message_length, format, arguments); va_end(arguments); *diagnostic = (pm_diagnostic_t) { - .location = { start, end }, + .location = { .start = start, .length = length }, .diag_id = diag_id, .message = message, - .owned = true, - .level = pm_diagnostic_level(diag_id) + .level = pm_diagnostic_id_level(diag_id) }; pm_list_append(list, (pm_list_node_t *) diagnostic); - return true; -} - -/** - * Deallocate the internal state of the given diagnostic list. - */ -void -pm_diagnostic_list_free(pm_list_t *list) { - pm_diagnostic_t *node = (pm_diagnostic_t *) list->head; - - while (node != NULL) { - pm_diagnostic_t *next = (pm_diagnostic_t *) node->node.next; - - if (node->owned) xfree((void *) node->message); - xfree(node); - - node = next; - } } diff --git a/prism/templates/src/json.c.erb b/prism/templates/src/json.c.erb new file mode 100644 index 0000000000..5c4ab8d92a --- /dev/null +++ b/prism/templates/src/json.c.erb @@ -0,0 +1,130 @@ +#include "prism/json.h" + +// Ensure this translation unit is never empty, even when JSON is excluded. +typedef int pm_json_unused_t; + +#ifndef PRISM_EXCLUDE_JSON + +#include "prism/internal/buffer.h" +#include "prism/internal/constant_pool.h" +#include "prism/internal/integer.h" +#include "prism/internal/parser.h" + +#include <inttypes.h> + +static void +pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) { + const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id); + pm_buffer_append_byte(buffer, '"'); + pm_buffer_append_source(buffer, constant->start, constant->length, PM_BUFFER_ESCAPING_JSON); + pm_buffer_append_byte(buffer, '"'); +} + +static void +pm_dump_json_location(pm_buffer_t *buffer, const pm_location_t *location) { + pm_buffer_append_format(buffer, "{\"start\":%" PRIu32 ",\"length\":%" PRIu32 "}", location->start, location->length); +} + +/** + * Dump JSON to the given buffer. + */ +void +pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node) { + switch (PM_NODE_TYPE(node)) { + <%- nodes.each do |node| -%> + case <%= node.type %>: { + pm_buffer_append_string(buffer, "{\"type\":\"<%= node.name %>\",\"location\":", <%= node.name.bytesize + 22 %>); + + const pm_<%= node.human %>_t *cast = (const pm_<%= node.human %>_t *) node; + pm_dump_json_location(buffer, &cast->base.location); + <%- [*node.flags, *node.fields].each_with_index do |field, index| -%> + + // Dump the <%= field.name %> field + pm_buffer_append_byte(buffer, ','); + <%- if field.is_a?(Prism::Template::Flags) -%> + pm_buffer_append_string(buffer, "\"flags\":", 8); + <%- else -%> + pm_buffer_append_string(buffer, "\"<%= field.name %>\":", <%= field.name.bytesize + 3 %>); + <%- end -%> + <%- case field -%> + <%- when Prism::Template::NodeField -%> + pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>); + <%- when Prism::Template::OptionalNodeField -%> + if (cast-><%= field.name %> != NULL) { + pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>); + } else { + pm_buffer_append_string(buffer, "null", 4); + } + <%- when Prism::Template::NodeListField -%> + const pm_node_list_t *<%= field.name %> = &cast-><%= field.name %>; + pm_buffer_append_byte(buffer, '['); + + for (size_t index = 0; index < <%= field.name %>->size; index++) { + if (index != 0) pm_buffer_append_byte(buffer, ','); + pm_dump_json(buffer, parser, <%= field.name %>->nodes[index]); + } + pm_buffer_append_byte(buffer, ']'); + <%- when Prism::Template::StringField -%> + const pm_string_t *<%= field.name %> = &cast-><%= field.name %>; + pm_buffer_append_byte(buffer, '"'); + pm_buffer_append_source(buffer, pm_string_source(<%= field.name %>), pm_string_length(<%= field.name %>), PM_BUFFER_ESCAPING_JSON); + pm_buffer_append_byte(buffer, '"'); + <%- when Prism::Template::ConstantField -%> + pm_dump_json_constant(buffer, parser, cast-><%= field.name %>); + <%- when Prism::Template::OptionalConstantField -%> + if (cast-><%= field.name %> != PM_CONSTANT_ID_UNSET) { + pm_dump_json_constant(buffer, parser, cast-><%= field.name %>); + } else { + pm_buffer_append_string(buffer, "null", 4); + } + <%- when Prism::Template::ConstantListField -%> + const pm_constant_id_list_t *<%= field.name %> = &cast-><%= field.name %>; + pm_buffer_append_byte(buffer, '['); + + for (size_t index = 0; index < <%= field.name %>->size; index++) { + if (index != 0) pm_buffer_append_byte(buffer, ','); + pm_dump_json_constant(buffer, parser, <%= field.name %>->ids[index]); + } + pm_buffer_append_byte(buffer, ']'); + <%- when Prism::Template::LocationField -%> + pm_dump_json_location(buffer, &cast-><%= field.name %>); + <%- when Prism::Template::OptionalLocationField -%> + if (cast-><%= field.name %>.length != 0) { + pm_dump_json_location(buffer, &cast-><%= field.name %>); + } else { + pm_buffer_append_string(buffer, "null", 4); + } + <%- when Prism::Template::UInt8Field -%> + pm_buffer_append_format(buffer, "%" PRIu8, cast-><%= field.name %>); + <%- when Prism::Template::UInt32Field -%> + pm_buffer_append_format(buffer, "%" PRIu32, cast-><%= field.name %>); + <%- when Prism::Template::Flags -%> + size_t flags = 0; + pm_buffer_append_byte(buffer, '['); + <%- node.flags.values.each_with_index do |value, index| -%> + if (PM_NODE_FLAG_P(cast, PM_<%= node.flags.human.upcase %>_<%= value.name %>)) { + if (flags != 0) pm_buffer_append_byte(buffer, ','); + pm_buffer_append_string(buffer, "\"<%= value.name %>\"", <%= value.name.bytesize + 2 %>); + flags++; + } + <%- end -%> + pm_buffer_append_byte(buffer, ']'); + <%- when Prism::Template::IntegerField -%> + pm_integer_string(buffer, &cast-><%= field.name %>); + <%- when Prism::Template::DoubleField -%> + pm_buffer_append_format(buffer, "%f", cast-><%= field.name %>); + <%- else -%> + <%- raise %> + <%- end -%> + <%- end -%> + + pm_buffer_append_byte(buffer, '}'); + break; + } + <%- end -%> + case PM_SCOPE_NODE: + break; + } +} + +#endif diff --git a/prism/templates/src/node.c.erb b/prism/templates/src/node.c.erb index 2357e55200..f51aff6e53 100644 --- a/prism/templates/src/node.c.erb +++ b/prism/templates/src/node.c.erb @@ -1,153 +1,85 @@ #line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" -#include "prism/node.h" +#include "prism/internal/node.h" + +#include "prism/internal/arena.h" + +#include <stdlib.h> /** * Attempts to grow the node list to the next size. If there is already - * capacity in the list, this function does nothing. Otherwise it reallocates - * the list to be twice as large as it was before. If the reallocation fails, - * this function returns false, otherwise it returns true. + * capacity in the list, this function does nothing. Otherwise it allocates a + * new array from the arena (abandon-and-copy strategy) and copies the existing + * data into it. */ -static bool -pm_node_list_grow(pm_node_list_t *list, size_t size) { +static void +pm_node_list_grow(pm_arena_t *arena, pm_node_list_t *list, size_t size) { size_t requested_size = list->size + size; - // If the requested size caused overflow, return false. - if (requested_size < list->size) return false; + // Guard against overflow on the addition. + if (requested_size < list->size) abort(); - // If the requested size is within the existing capacity, return true. - if (requested_size < list->capacity) return true; + // If the requested size is within the existing capacity, return. + if (requested_size <= list->capacity) return; - // Otherwise, reallocate the list to be twice as large as it was before. + // Otherwise, compute the next capacity by doubling. size_t next_capacity = list->capacity == 0 ? 4 : list->capacity * 2; - // If multiplying by 2 caused overflow, return false. - if (next_capacity < list->capacity) return false; - - // If we didn't get enough by doubling, keep doubling until we do. + // Guard against overflow on the doubling. while (requested_size > next_capacity) { - size_t double_capacity = next_capacity * 2; - - // Ensure we didn't overflow by multiplying by 2. - if (double_capacity < next_capacity) return false; - next_capacity = double_capacity; + if (next_capacity == 0) abort(); + next_capacity *= 2; } - pm_node_t **nodes = (pm_node_t **) xrealloc(list->nodes, sizeof(pm_node_t *) * next_capacity); - if (nodes == NULL) return false; + // Allocate a new array from the arena (old array is abandoned). + pm_node_t **nodes = (pm_node_t **) pm_arena_alloc(arena, sizeof(pm_node_t *) * next_capacity, PRISM_ALIGNOF(pm_node_t *)); + + // Copy old data into the new array. + if (list->size > 0) { + memcpy(nodes, list->nodes, list->size * sizeof(pm_node_t *)); + } list->nodes = nodes; list->capacity = next_capacity; - return true; } /** - * Append a new node onto the end of the node list. + * Slow path for pm_node_list_append: grow the list and append the node. + * Do not call directly - use pm_node_list_append instead. */ void -pm_node_list_append(pm_node_list_t *list, pm_node_t *node) { - if (pm_node_list_grow(list, 1)) { - list->nodes[list->size++] = node; - } +pm_node_list_append_slow(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) { + pm_node_list_grow(arena, list, 1); + list->nodes[list->size++] = node; } /** * Prepend a new node onto the beginning of the node list. */ void -pm_node_list_prepend(pm_node_list_t *list, pm_node_t *node) { - if (pm_node_list_grow(list, 1)) { - memmove(list->nodes + 1, list->nodes, list->size * sizeof(pm_node_t *)); - list->nodes[0] = node; - list->size++; - } +pm_node_list_prepend(pm_arena_t *arena, pm_node_list_t *list, pm_node_t *node) { + pm_node_list_grow(arena, list, 1); + memmove(list->nodes + 1, list->nodes, list->size * sizeof(pm_node_t *)); + list->nodes[0] = node; + list->size++; } /** * Concatenate the given node list onto the end of the other node list. */ void -pm_node_list_concat(pm_node_list_t *list, pm_node_list_t *other) { - if (other->size > 0 && pm_node_list_grow(list, other->size)) { +pm_node_list_concat(pm_arena_t *arena, pm_node_list_t *list, pm_node_list_t *other) { + if (other->size > 0) { + pm_node_list_grow(arena, list, other->size); memcpy(list->nodes + list->size, other->nodes, other->size * sizeof(pm_node_t *)); list->size += other->size; } } /** - * Free the internal memory associated with the given node list. - */ -void -pm_node_list_free(pm_node_list_t *list) { - if (list->capacity > 0) { - xfree(list->nodes); - *list = (pm_node_list_t) { 0 }; - } -} - -PRISM_EXPORTED_FUNCTION void -pm_node_destroy(pm_parser_t *parser, pm_node_t *node); - -/** - * Destroy the nodes that are contained within the given node list. - */ -static void -pm_node_list_destroy(pm_parser_t *parser, pm_node_list_t *list) { - pm_node_t *node; - PM_NODE_LIST_FOREACH(list, index, node) pm_node_destroy(parser, node); - pm_node_list_free(list); -} - -/** - * Deallocate the space for a pm_node_t. Similarly to pm_node_alloc, we're not - * using the parser argument, but it's there to allow for the future possibility - * of pre-allocating larger memory pools. - */ -PRISM_EXPORTED_FUNCTION void -pm_node_destroy(pm_parser_t *parser, pm_node_t *node) { - switch (PM_NODE_TYPE(node)) { - <%- nodes.each do |node| -%> -#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" - case <%= node.type %>: { - <%- if node.fields.any? { |field| ![Prism::Template::LocationField, Prism::Template::OptionalLocationField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::ConstantField, Prism::Template::OptionalConstantField, Prism::Template::DoubleField].include?(field.class) } -%> - pm_<%= node.human %>_t *cast = (pm_<%= node.human %>_t *) node; - <%- end -%> - <%- node.fields.each do |field| -%> - <%- case field -%> - <%- when Prism::Template::LocationField, Prism::Template::OptionalLocationField, Prism::Template::UInt8Field, Prism::Template::UInt32Field, Prism::Template::ConstantField, Prism::Template::OptionalConstantField, Prism::Template::DoubleField -%> - <%- when Prism::Template::NodeField -%> - pm_node_destroy(parser, (pm_node_t *)cast-><%= field.name %>); - <%- when Prism::Template::OptionalNodeField -%> - if (cast-><%= field.name %> != NULL) { - pm_node_destroy(parser, (pm_node_t *)cast-><%= field.name %>); - } - <%- when Prism::Template::StringField -%> - pm_string_free(&cast-><%= field.name %>); - <%- when Prism::Template::NodeListField -%> - pm_node_list_destroy(parser, &cast-><%= field.name %>); - <%- when Prism::Template::ConstantListField -%> - pm_constant_id_list_free(&cast-><%= field.name %>); - <%- when Prism::Template::IntegerField -%> - pm_integer_free(&cast-><%= field.name %>); - <%- else -%> - <%- raise -%> - <%- end -%> - <%- end -%> - break; - } - <%- end -%> -#line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" - default: - assert(false && "unreachable"); - break; - } - xfree(node); -} - -/** * Returns a string representation of the given node type. */ -PRISM_EXPORTED_FUNCTION const char * -pm_node_type_to_str(pm_node_type_t node_type) +const char * +pm_node_type(pm_node_type_t node_type) { switch (node_type) { <%- nodes.each do |node| -%> @@ -166,7 +98,7 @@ pm_node_type_to_str(pm_node_type_t node_type) * pointer and is passed to the visitor callback for consumers to use as they * see fit. */ -PRISM_EXPORTED_FUNCTION void +void pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) { if (visitor(node, data)) pm_visit_child_nodes(node, visitor, data); } @@ -176,7 +108,7 @@ pm_visit_node(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void * default behavior for walking the tree that is called from pm_visit_node if * the callback returns true. */ -PRISM_EXPORTED_FUNCTION void +void pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *node, void *data), void *data) { switch (PM_NODE_TYPE(node)) { <%- nodes.each do |node| -%> @@ -212,122 +144,23 @@ pm_visit_child_nodes(const pm_node_t *node, bool (*visitor)(const pm_node_t *nod break; } } +<%- nodes.each do |node| -%> -// We optionally support dumping to JSON. For systems that don't want or need -// this functionality, it can be turned off with the PRISM_EXCLUDE_JSON define. -#ifndef PRISM_EXCLUDE_JSON - -static void -pm_dump_json_constant(pm_buffer_t *buffer, const pm_parser_t *parser, pm_constant_id_t constant_id) { - const pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id); - pm_buffer_append_byte(buffer, '"'); - pm_buffer_append_source(buffer, constant->start, constant->length, PM_BUFFER_ESCAPING_JSON); - pm_buffer_append_byte(buffer, '"'); -} - -static void -pm_dump_json_location(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_location_t *location) { - uint32_t start = (uint32_t) (location->start - parser->start); - uint32_t end = (uint32_t) (location->end - parser->start); - pm_buffer_append_format(buffer, "{\"start\":%" PRIu32 ",\"end\":%" PRIu32 "}", start, end); -} - +<%- params = node.fields.map(&:c_param) -%> /** - * Dump JSON to the given buffer. + * Allocate and initialize a new <%= node.name %> node. */ -PRISM_EXPORTED_FUNCTION void -pm_dump_json(pm_buffer_t *buffer, const pm_parser_t *parser, const pm_node_t *node) { - switch (PM_NODE_TYPE(node)) { - <%- nodes.each do |node| -%> - case <%= node.type %>: { - pm_buffer_append_string(buffer, "{\"type\":\"<%= node.name %>\",\"location\":", <%= node.name.bytesize + 22 %>); - - const pm_<%= node.human %>_t *cast = (const pm_<%= node.human %>_t *) node; - pm_dump_json_location(buffer, parser, &cast->base.location); - <%- [*node.flags, *node.fields].each_with_index do |field, index| -%> - - // Dump the <%= field.name %> field - pm_buffer_append_byte(buffer, ','); - pm_buffer_append_string(buffer, "\"<%= field.name %>\":", <%= field.name.bytesize + 3 %>); - <%- case field -%> - <%- when Prism::Template::NodeField -%> - pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>); - <%- when Prism::Template::OptionalNodeField -%> - if (cast-><%= field.name %> != NULL) { - pm_dump_json(buffer, parser, (const pm_node_t *) cast-><%= field.name %>); - } else { - pm_buffer_append_string(buffer, "null", 4); - } - <%- when Prism::Template::NodeListField -%> - const pm_node_list_t *<%= field.name %> = &cast-><%= field.name %>; - pm_buffer_append_byte(buffer, '['); - - for (size_t index = 0; index < <%= field.name %>->size; index++) { - if (index != 0) pm_buffer_append_byte(buffer, ','); - pm_dump_json(buffer, parser, <%= field.name %>->nodes[index]); - } - pm_buffer_append_byte(buffer, ']'); - <%- when Prism::Template::StringField -%> - const pm_string_t *<%= field.name %> = &cast-><%= field.name %>; - pm_buffer_append_byte(buffer, '"'); - pm_buffer_append_source(buffer, pm_string_source(<%= field.name %>), pm_string_length(<%= field.name %>), PM_BUFFER_ESCAPING_JSON); - pm_buffer_append_byte(buffer, '"'); - <%- when Prism::Template::ConstantField -%> - pm_dump_json_constant(buffer, parser, cast-><%= field.name %>); - <%- when Prism::Template::OptionalConstantField -%> - if (cast-><%= field.name %> != PM_CONSTANT_ID_UNSET) { - pm_dump_json_constant(buffer, parser, cast-><%= field.name %>); - } else { - pm_buffer_append_string(buffer, "null", 4); - } - <%- when Prism::Template::ConstantListField -%> - const pm_constant_id_list_t *<%= field.name %> = &cast-><%= field.name %>; - pm_buffer_append_byte(buffer, '['); - - for (size_t index = 0; index < <%= field.name %>->size; index++) { - if (index != 0) pm_buffer_append_byte(buffer, ','); - pm_dump_json_constant(buffer, parser, <%= field.name %>->ids[index]); - } - pm_buffer_append_byte(buffer, ']'); - <%- when Prism::Template::LocationField -%> - pm_dump_json_location(buffer, parser, &cast-><%= field.name %>); - <%- when Prism::Template::OptionalLocationField -%> - if (cast-><%= field.name %>.start != NULL) { - pm_dump_json_location(buffer, parser, &cast-><%= field.name %>); - } else { - pm_buffer_append_string(buffer, "null", 4); - } - <%- when Prism::Template::UInt8Field -%> - pm_buffer_append_format(buffer, "%" PRIu8, cast-><%= field.name %>); - <%- when Prism::Template::UInt32Field -%> - pm_buffer_append_format(buffer, "%" PRIu32, cast-><%= field.name %>); - <%- when Prism::Template::Flags -%> - size_t flags = 0; - pm_buffer_append_byte(buffer, '['); - <%- node.flags.values.each_with_index do |value, index| -%> - if (PM_NODE_FLAG_P(cast, PM_<%= node.flags.human.upcase %>_<%= value.name %>)) { - if (flags != 0) pm_buffer_append_byte(buffer, ','); - pm_buffer_append_string(buffer, "\"<%= value.name %>\"", <%= value.name.bytesize + 2 %>); - flags++; - } - <%- end -%> - pm_buffer_append_byte(buffer, ']'); - <%- when Prism::Template::IntegerField -%> - pm_integer_string(buffer, &cast-><%= field.name %>); - <%- when Prism::Template::DoubleField -%> - pm_buffer_append_format(buffer, "%f", cast-><%= field.name %>); - <%- else -%> - <%- raise %> - <%- end -%> - <%- end -%> +pm_<%= node.human %>_t * +pm_<%= node.human %>_new(pm_arena_t *arena, uint32_t node_id, pm_node_flags_t flags, pm_location_t location<%= params.empty? ? "" : ", #{params.join(", ")}" %>) { + pm_<%= node.human %>_t *node = (pm_<%= node.human %>_t *) pm_arena_alloc(arena, sizeof(pm_<%= node.human %>_t), PRISM_ALIGNOF(pm_<%= node.human %>_t)); + + *node = (pm_<%= node.human %>_t) { + .base = { .type = <%= node.type %>, .flags = flags, .node_id = node_id, .location = location }<%= node.fields.empty? ? "" : "," %> +<%- node.fields.each_with_index do |field, index| -%> + .<%= field.name %> = <%= field.name %><%= index < node.fields.size - 1 ? "," : "" %> +<%- end -%> + }; - pm_buffer_append_byte(buffer, '}'); - break; - } - <%- end -%> - case PM_SCOPE_NODE: - break; - } + return node; } - -#endif +<%- end -%> diff --git a/prism/templates/src/prettyprint.c.erb b/prism/templates/src/prettyprint.c.erb index 639c2fecf3..f12531d934 100644 --- a/prism/templates/src/prettyprint.c.erb +++ b/prism/templates/src/prettyprint.c.erb @@ -1,23 +1,34 @@ <%# encoding: ASCII -%> #include "prism/prettyprint.h" -// We optionally support pretty printing nodes. For systems that don't want or -// need this functionality, it can be turned off with the -// PRISM_EXCLUDE_PRETTYPRINT define. +/* We optionally support pretty printing nodes. For systems that don't want or + * need this functionality, it can be turned off with the + * PRISM_EXCLUDE_PRETTYPRINT define. */ #ifdef PRISM_EXCLUDE_PRETTYPRINT -void pm_prettyprint(void) {} +/* Ensure this translation unit is never empty, even when prettyprint is + * excluded. */ +typedef int pm_prettyprint_unused_t; #else -static inline void +#include "prism/compiler/inline.h" +#include "prism/internal/buffer.h" +#include "prism/internal/constant_pool.h" +#include "prism/internal/integer.h" +#include "prism/internal/parser.h" +#include "prism/line_offset_list.h" + +#include <inttypes.h> + +static PRISM_INLINE void prettyprint_location(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_location_t *location) { - pm_line_column_t start = pm_newline_list_line_column(&parser->newline_list, location->start, parser->start_line); - pm_line_column_t end = pm_newline_list_line_column(&parser->newline_list, location->end, parser->start_line); + pm_line_column_t start = pm_line_offset_list_line_column(&parser->line_offsets, location->start, parser->start_line); + pm_line_column_t end = pm_line_offset_list_line_column(&parser->line_offsets, location->start + location->length, parser->start_line); pm_buffer_append_format(output_buffer, "(%" PRIi32 ",%" PRIu32 ")-(%" PRIi32 ",%" PRIu32 ")", start.line, start.column, end.line, end.column); } -static inline void +static PRISM_INLINE void prettyprint_constant(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_constant_id_t constant_id) { pm_constant_t *constant = pm_constant_pool_id_to_constant(&parser->constant_pool, constant_id); pm_buffer_append_format(output_buffer, ":%.*s", (int) constant->length, constant->start); @@ -106,17 +117,17 @@ prettyprint_node(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm pm_buffer_append_byte(output_buffer, ' '); prettyprint_location(output_buffer, parser, location); pm_buffer_append_string(output_buffer, " = \"", 4); - pm_buffer_append_source(output_buffer, location->start, (size_t) (location->end - location->start), PM_BUFFER_ESCAPING_RUBY); + pm_buffer_append_source(output_buffer, parser->start + location->start, (size_t) location->length, PM_BUFFER_ESCAPING_RUBY); pm_buffer_append_string(output_buffer, "\"\n", 2); <%- when Prism::Template::OptionalLocationField -%> pm_location_t *location = &cast-><%= field.name %>; - if (location->start == NULL) { + if (location->length == 0) { pm_buffer_append_string(output_buffer, " nil\n", 5); } else { pm_buffer_append_byte(output_buffer, ' '); prettyprint_location(output_buffer, parser, location); pm_buffer_append_string(output_buffer, " = \"", 4); - pm_buffer_append_source(output_buffer, location->start, (size_t) (location->end - location->start), PM_BUFFER_ESCAPING_RUBY); + pm_buffer_append_source(output_buffer, parser->start + location->start, (size_t) location->length, PM_BUFFER_ESCAPING_RUBY); pm_buffer_append_string(output_buffer, "\"\n", 2); } <%- when Prism::Template::UInt8Field -%> @@ -156,11 +167,11 @@ prettyprint_node(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm /** * Pretty-prints the AST represented by the given node to the given buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_prettyprint(pm_buffer_t *output_buffer, const pm_parser_t *parser, const pm_node_t *node) { pm_buffer_t prefix_buffer = { 0 }; prettyprint_node(output_buffer, parser, node, &prefix_buffer); - pm_buffer_free(&prefix_buffer); + pm_buffer_cleanup(&prefix_buffer); } #endif diff --git a/prism/templates/src/serialize.c.erb b/prism/templates/src/serialize.c.erb index 3e15a11039..3d9811e5db 100644 --- a/prism/templates/src/serialize.c.erb +++ b/prism/templates/src/serialize.c.erb @@ -1,57 +1,58 @@ -#include "prism.h" +#include "prism/excludes.h" + +/* We optionally support serializing to a binary string. For systems that do not + * want or need this functionality, it can be turned off with the + * PRISM_EXCLUDE_SERIALIZATION define. */ +#ifdef PRISM_EXCLUDE_SERIALIZATION + +/* Ensure this translation unit is never empty, even when serialization is + * excluded. */ +typedef int pm_serialize_unused_t; + +#else + +#include "prism/compiler/inline.h" -// We optionally support serializing to a binary string. For systems that don't -// want or need this functionality, it can be turned off with the -// PRISM_EXCLUDE_SERIALIZATION define. -#ifndef PRISM_EXCLUDE_SERIALIZATION +#include "prism/internal/buffer.h" +#include "prism/internal/comments.h" +#include "prism/internal/diagnostic.h" +#include "prism/internal/encoding.h" +#include "prism/internal/list.h" +#include "prism/internal/magic_comments.h" +#include "prism/internal/options.h" +#include "prism/internal/parser.h" +#include "prism.h" +#include "prism/ast.h" +#include "prism/line_offset_list.h" + +#include <assert.h> #include <stdio.h> +#include <string.h> -static inline uint32_t +static PRISM_INLINE uint32_t pm_ptrdifft_to_u32(ptrdiff_t value) { assert(value >= 0 && ((unsigned long) value) < UINT32_MAX); return (uint32_t) value; } -static inline uint32_t +static PRISM_INLINE uint32_t pm_sizet_to_u32(size_t value) { assert(value < UINT32_MAX); return (uint32_t) value; } static void -pm_serialize_location(const pm_parser_t *parser, const pm_location_t *location, pm_buffer_t *buffer) { - assert(location->start); - assert(location->end); - assert(location->start <= location->end); - - pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(location->start - parser->start)); - pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(location->end - location->start)); +pm_serialize_location(const pm_location_t *location, pm_buffer_t *buffer) { + pm_buffer_append_varuint(buffer, location->start); + pm_buffer_append_varuint(buffer, location->length); } static void -pm_serialize_string(const pm_parser_t *parser, const pm_string_t *string, pm_buffer_t *buffer) { - switch (string->type) { - case PM_STRING_SHARED: { - pm_buffer_append_byte(buffer, 1); - pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(pm_string_source(string) - parser->start)); - pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_string_length(string))); - break; - } - case PM_STRING_OWNED: - case PM_STRING_CONSTANT: { - uint32_t length = pm_sizet_to_u32(pm_string_length(string)); - pm_buffer_append_byte(buffer, 2); - pm_buffer_append_varuint(buffer, length); - pm_buffer_append_bytes(buffer, pm_string_source(string), length); - break; - } -#ifdef PRISM_HAS_MMAP - case PM_STRING_MAPPED: - assert(false && "Cannot serialize mapped strings."); - break; -#endif - } +pm_serialize_string(const pm_string_t *string, pm_buffer_t *buffer) { + uint32_t length = pm_sizet_to_u32(pm_string_length(string)); + pm_buffer_append_varuint(buffer, length); + pm_buffer_append_bytes(buffer, pm_string_source(string), length); } static void @@ -72,12 +73,10 @@ static void pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { pm_buffer_append_byte(buffer, (uint8_t) PM_NODE_TYPE(node)); - size_t offset = buffer->length; - <%- if Prism::Template::INCLUDE_NODE_ID -%> pm_buffer_append_varuint(buffer, node->node_id); <%- end -%> - pm_serialize_location(parser, &node->location, buffer); + pm_serialize_location(&node->location, buffer); switch (PM_NODE_TYPE(node)) { // We do not need to serialize a ScopeNode ever as @@ -106,7 +105,7 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { pm_serialize_node(parser, (pm_node_t *)((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); } <%- when Prism::Template::StringField -%> - pm_serialize_string(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + pm_serialize_string(&((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); <%- when Prism::Template::NodeListField -%> uint32_t <%= field.name %>_size = pm_sizet_to_u32(((pm_<%= node.human %>_t *)node)-><%= field.name %>.size); pm_buffer_append_varuint(buffer, <%= field.name %>_size); @@ -123,15 +122,15 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { } <%- when Prism::Template::LocationField -%> <%- if field.should_be_serialized? -%> - pm_serialize_location(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + pm_serialize_location(&((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); <%- end -%> <%- when Prism::Template::OptionalLocationField -%> <%- if field.should_be_serialized? -%> - if (((pm_<%= node.human %>_t *)node)-><%= field.name %>.start == NULL) { + if (((pm_<%= node.human %>_t *)node)-><%= field.name %>.length == 0) { pm_buffer_append_byte(buffer, 0); } else { pm_buffer_append_byte(buffer, 1); - pm_serialize_location(parser, &((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); + pm_serialize_location(&((pm_<%= node.human %>_t *)node)-><%= field.name %>, buffer); } <%- end -%> <%- when Prism::Template::UInt8Field -%> @@ -148,7 +147,7 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { <%- end -%> <%- if node.needs_serialized_length? -%> // serialize length - uint32_t length = pm_sizet_to_u32(buffer->length - offset - sizeof(uint32_t)); + uint32_t length = pm_sizet_to_u32(buffer->length - length_offset); memcpy(buffer->value + length_offset, &length, sizeof(uint32_t)); <%- end -%> break; @@ -158,7 +157,7 @@ pm_serialize_node(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) { } static void -pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) { +pm_serialize_line_offset_list(pm_line_offset_list_t *list, pm_buffer_t *buffer) { uint32_t size = pm_sizet_to_u32(list->size); pm_buffer_append_varuint(buffer, size); @@ -169,60 +168,60 @@ pm_serialize_newline_list(pm_newline_list_t *list, pm_buffer_t *buffer) { } static void -pm_serialize_comment(pm_parser_t *parser, pm_comment_t *comment, pm_buffer_t *buffer) { +pm_serialize_comment(pm_comment_t *comment, pm_buffer_t *buffer) { // serialize type pm_buffer_append_byte(buffer, (uint8_t) comment->type); // serialize location - pm_serialize_location(parser, &comment->location, buffer); + pm_serialize_location(&comment->location, buffer); } /** * Serialize the given list of comments to the given buffer. */ void -pm_serialize_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { +pm_serialize_comment_list(pm_list_t *list, pm_buffer_t *buffer) { pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); pm_comment_t *comment; for (comment = (pm_comment_t *) list->head; comment != NULL; comment = (pm_comment_t *) comment->node.next) { - pm_serialize_comment(parser, comment, buffer); + pm_serialize_comment(comment, buffer); } } static void -pm_serialize_magic_comment(pm_parser_t *parser, pm_magic_comment_t *magic_comment, pm_buffer_t *buffer) { +pm_serialize_magic_comment(pm_magic_comment_t *magic_comment, pm_buffer_t *buffer) { // serialize key location - pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(magic_comment->key_start - parser->start)); - pm_buffer_append_varuint(buffer, pm_sizet_to_u32(magic_comment->key_length)); + pm_buffer_append_varuint(buffer, magic_comment->key.start); + pm_buffer_append_varuint(buffer, magic_comment->key.length); // serialize value location - pm_buffer_append_varuint(buffer, pm_ptrdifft_to_u32(magic_comment->value_start - parser->start)); - pm_buffer_append_varuint(buffer, pm_sizet_to_u32(magic_comment->value_length)); + pm_buffer_append_varuint(buffer, magic_comment->value.start); + pm_buffer_append_varuint(buffer, magic_comment->value.length); } static void -pm_serialize_magic_comment_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { +pm_serialize_magic_comment_list(pm_list_t *list, pm_buffer_t *buffer) { pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); pm_magic_comment_t *magic_comment; for (magic_comment = (pm_magic_comment_t *) list->head; magic_comment != NULL; magic_comment = (pm_magic_comment_t *) magic_comment->node.next) { - pm_serialize_magic_comment(parser, magic_comment, buffer); + pm_serialize_magic_comment(magic_comment, buffer); } } static void pm_serialize_data_loc(const pm_parser_t *parser, pm_buffer_t *buffer) { - if (parser->data_loc.end == NULL) { + if (parser->data_loc.length == 0) { pm_buffer_append_byte(buffer, 0); } else { pm_buffer_append_byte(buffer, 1); - pm_serialize_location(parser, &parser->data_loc, buffer); + pm_serialize_location(&parser->data_loc, buffer); } } static void -pm_serialize_diagnostic(pm_parser_t *parser, pm_diagnostic_t *diagnostic, pm_buffer_t *buffer) { +pm_serialize_diagnostic(pm_diagnostic_t *diagnostic, pm_buffer_t *buffer) { // serialize the type pm_buffer_append_varuint(buffer, (uint32_t) diagnostic->diag_id); @@ -232,18 +231,18 @@ pm_serialize_diagnostic(pm_parser_t *parser, pm_diagnostic_t *diagnostic, pm_buf pm_buffer_append_string(buffer, diagnostic->message, message_length); // serialize location - pm_serialize_location(parser, &diagnostic->location, buffer); + pm_serialize_location(&diagnostic->location, buffer); pm_buffer_append_byte(buffer, diagnostic->level); } static void -pm_serialize_diagnostic_list(pm_parser_t *parser, pm_list_t *list, pm_buffer_t *buffer) { +pm_serialize_diagnostic_list(pm_list_t *list, pm_buffer_t *buffer) { pm_buffer_append_varuint(buffer, pm_sizet_to_u32(pm_list_size(list))); pm_diagnostic_t *diagnostic; for (diagnostic = (pm_diagnostic_t *) list->head; diagnostic != NULL; diagnostic = (pm_diagnostic_t *) diagnostic->node.next) { - pm_serialize_diagnostic(parser, diagnostic, buffer); + pm_serialize_diagnostic(diagnostic, buffer); } } @@ -261,14 +260,15 @@ static void pm_serialize_metadata(pm_parser_t *parser, pm_buffer_t *buffer) { pm_serialize_encoding(parser->encoding, buffer); pm_buffer_append_varsint(buffer, parser->start_line); - pm_serialize_newline_list(&parser->newline_list, buffer); + pm_serialize_line_offset_list(&parser->line_offsets, buffer); <%- unless Prism::Template::SERIALIZE_ONLY_SEMANTICS_FIELDS -%> - pm_serialize_comment_list(parser, &parser->comment_list, buffer); + pm_serialize_comment_list(&parser->comment_list, buffer); <%- end -%> - pm_serialize_magic_comment_list(parser, &parser->magic_comment_list, buffer); + pm_serialize_magic_comment_list(&parser->magic_comment_list, buffer); pm_serialize_data_loc(parser, buffer); - pm_serialize_diagnostic_list(parser, &parser->error_list, buffer); - pm_serialize_diagnostic_list(parser, &parser->warning_list, buffer); + pm_serialize_diagnostic_list(&parser->error_list, buffer); + pm_serialize_diagnostic_list(&parser->warning_list, buffer); + pm_buffer_append_byte(buffer, (uint8_t) parser->continuable); } #line <%= __LINE__ + 1 %> "prism/templates/src/<%= File.basename(__FILE__) %>" @@ -308,28 +308,12 @@ pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) pm_constant_t *constant = &parser->constant_pool.constants[bucket->id - 1]; size_t buffer_offset = offset + ((((size_t)bucket->id) - 1) * 8); - if (bucket->type == PM_CONSTANT_POOL_BUCKET_OWNED || bucket->type == PM_CONSTANT_POOL_BUCKET_CONSTANT) { - // Since this is an owned or constant constant, we are going to - // write its contents into the buffer after the constant pool. - // So effectively in place of the source offset, we have a - // buffer offset. We will add a leading 1 to indicate that this - // is a buffer offset. - uint32_t content_offset = pm_sizet_to_u32(buffer->length); - uint32_t owned_mask = (uint32_t) (1 << 31); + // Write the constant contents into the buffer after the constant + // pool. In place of the source offset, we store a buffer offset. + uint32_t content_offset = pm_sizet_to_u32(buffer->length); + memcpy(buffer->value + buffer_offset, &content_offset, 4); + pm_buffer_append_bytes(buffer, constant->start, constant->length); - assert(content_offset < owned_mask); - content_offset |= owned_mask; - - memcpy(buffer->value + buffer_offset, &content_offset, 4); - pm_buffer_append_bytes(buffer, constant->start, constant->length); - } else { - // Since this is a shared constant, we are going to write its - // source offset directly into the buffer. - uint32_t source_offset = pm_ptrdifft_to_u32(constant->start - parser->start); - memcpy(buffer->value + buffer_offset, &source_offset, 4); - } - - // Now we can write the length of the constant into the buffer. uint32_t constant_length = pm_sizet_to_u32(constant->length); memcpy(buffer->value + buffer_offset + 4, &constant_length, 4); } @@ -337,7 +321,7 @@ pm_serialize_content(pm_parser_t *parser, pm_node_t *node, pm_buffer_t *buffer) } static void -serialize_token(void *data, pm_parser_t *parser, pm_token_t *token) { +serialize_token(pm_parser_t *parser, pm_token_t *token, void *data) { pm_buffer_t *buffer = (pm_buffer_t *) data; pm_buffer_append_varuint(buffer, token->type); @@ -349,58 +333,72 @@ serialize_token(void *data, pm_parser_t *parser, pm_token_t *token) { /** * Lex the given source and serialize to the given buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_serialize_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { pm_options_t options = { 0 }; pm_options_read(&options, data); + pm_arena_t arena = { 0 }; pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); + pm_parser_init(&arena, &parser, source, size, &options); - pm_lex_callback_t lex_callback = (pm_lex_callback_t) { - .data = (void *) buffer, - .callback = serialize_token, - }; - - parser.lex_callback = &lex_callback; - pm_node_t *node = pm_parse(&parser); + pm_parser_lex_callback_set(&parser, serialize_token, buffer); + pm_parse(&parser); // Append 0 to mark end of tokens. pm_buffer_append_byte(buffer, 0); pm_serialize_metadata(&parser, buffer); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_cleanup(&parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); } /** * Parse and serialize both the AST and the tokens represented by the given * source to the given buffer. */ -PRISM_EXPORTED_FUNCTION void +void pm_serialize_parse_lex(pm_buffer_t *buffer, const uint8_t *source, size_t size, const char *data) { pm_options_t options = { 0 }; pm_options_read(&options, data); + pm_arena_t arena = { 0 }; pm_parser_t parser; - pm_parser_init(&parser, source, size, &options); - - pm_lex_callback_t lex_callback = (pm_lex_callback_t) { - .data = (void *) buffer, - .callback = serialize_token, - }; + pm_parser_init(&arena, &parser, source, size, &options); - parser.lex_callback = &lex_callback; + pm_parser_lex_callback_set(&parser, serialize_token, buffer); pm_node_t *node = pm_parse(&parser); pm_buffer_append_byte(buffer, 0); pm_serialize(&parser, node, buffer); - pm_node_destroy(&parser, node); - pm_parser_free(&parser); - pm_options_free(&options); + pm_parser_cleanup(&parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); +} + +/** + * Parse the source and return true if it parses without errors or warnings. + */ +bool +pm_serialize_parse_success_p(const uint8_t *source, size_t size, const char *data) { + pm_options_t options = { 0 }; + pm_options_read(&options, data); + + pm_arena_t arena = { 0 }; + pm_parser_t parser; + pm_parser_init(&arena, &parser, source, size, &options); + + pm_parse(&parser); + + bool result = parser.error_list.size == 0; + pm_parser_cleanup(&parser); + pm_arena_cleanup(&arena); + pm_options_cleanup(&options); + + return result; } #endif diff --git a/prism/templates/src/token_type.c.erb b/prism/templates/src/tokens.c.erb index f196393ee1..1e82954738 100644 --- a/prism/templates/src/token_type.c.erb +++ b/prism/templates/src/tokens.c.erb @@ -1,12 +1,12 @@ -#include <string.h> - #include "prism/ast.h" +#include <assert.h> + /** * Returns a string representation of the given token type. */ -PRISM_EXPORTED_FUNCTION const char * -pm_token_type_name(pm_token_type_t token_type) { +const char * +pm_token_type(pm_token_type_t token_type) { switch (token_type) { <%- tokens.each do |token| -%> case PM_TOKEN_<%= token.name %>: @@ -27,14 +27,10 @@ pm_token_type_name(pm_token_type_t token_type) { * Returns the human name of the given token type. */ const char * -pm_token_type_human(pm_token_type_t token_type) { +pm_token_str(pm_token_type_t token_type) { switch (token_type) { case PM_TOKEN_EOF: return "end-of-input"; - case PM_TOKEN_MISSING: - return "missing token"; - case PM_TOKEN_NOT_PROVIDED: - return "not provided token"; case PM_TOKEN_AMPERSAND: return "'&'"; case PM_TOKEN_AMPERSAND_AMPERSAND: @@ -171,6 +167,8 @@ pm_token_type_human(pm_token_type_t token_type) { return "'defined?'"; case PM_TOKEN_KEYWORD_DO: return "'do'"; + case PM_TOKEN_KEYWORD_DO_BLOCK: + return "'do'"; case PM_TOKEN_KEYWORD_DO_LOOP: return "'do'"; case PM_TOKEN_KEYWORD_ELSE: @@ -362,8 +360,8 @@ pm_token_type_human(pm_token_type_t token_type) { return ""; } - // Provide a default, because some compilers can't determine that the above - // switch is exhaustive. + /* Provide a default, because some compilers cannot determine that the above + * switch is exhaustive. */ assert(false && "unreachable"); return ""; } diff --git a/prism/templates/template.rb b/prism/templates/template.rb index 30cb60cabd..0fdeda561f 100755 --- a/prism/templates/template.rb +++ b/prism/templates/template.rb @@ -6,13 +6,12 @@ require "fileutils" require "yaml" module Prism - module Template + module Template # :nodoc: all SERIALIZE_ONLY_SEMANTICS_FIELDS = ENV.fetch("PRISM_SERIALIZE_ONLY_SEMANTICS_FIELDS", false) - REMOVE_ON_ERROR_TYPES = SERIALIZE_ONLY_SEMANTICS_FIELDS CHECK_FIELD_KIND = ENV.fetch("CHECK_FIELD_KIND", false) - JAVA_BACKEND = ENV["PRISM_JAVA_BACKEND"] || "truffleruby" - JAVA_STRING_TYPE = JAVA_BACKEND == "jruby" ? "org.jruby.RubySymbol" : "String" + JAVA_BACKEND = ENV["PRISM_JAVA_BACKEND"] || "default" + JAVA_IDENTIFIER_TYPE = JAVA_BACKEND == "truffleruby" ? "String" : "byte[]" INCLUDE_NODE_ID = !SERIALIZE_ONLY_SEMANTICS_FIELDS || JAVA_BACKEND == "jruby" COMMON_FLAGS_COUNT = 2 @@ -49,6 +48,14 @@ module Prism end end + # This module contains methods for escaping characters in Doxygen comments. + module Doxygen + # Similar to /verbatim ... /endverbatim but doesn't wrap the result in a code block. + def self.verbatim(value) + value.gsub(/[*%!`#<>_+@-]/, '\\\\\0') + end + end + # A comment attached to a field or node. class ConfigComment attr_reader :value @@ -97,6 +104,11 @@ module Prism # Some node fields can be specialized if they point to a specific kind of # node and not just a generic node. class NodeKindField < Field + # The C type to use for this field as a function parameter. + def c_param + "struct #{c_type} *#{name}" + end + def initialize(kind:, **options) @kind = kind super(**options) @@ -142,27 +154,27 @@ module Prism if specific_kind specific_kind elsif union_kind - union_kind.join(" | ") + "(#{union_kind.join(" | ")})" else "Prism::node" end end - def rbi_class + def call_seq_type if specific_kind - "Prism::#{specific_kind}" + specific_kind elsif union_kind - "T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")})" + union_kind.join(" | ") else - "Prism::Node" + "Node" end end def check_field_kind if union_kind - "[#{union_kind.join(', ')}].include?(#{name}.class)" + "[#{union_kind.join(', ')}, ErrorRecoveryNode].include?(#{name}.class)" else - "#{name}.is_a?(#{ruby_type})" + "#{name}.is_a?(#{ruby_type}) || #{name}.is_a?(ErrorRecoveryNode)" end end end @@ -174,27 +186,27 @@ module Prism if specific_kind "#{specific_kind}?" elsif union_kind - [*union_kind, "nil"].join(" | ") + "(#{union_kind.join(" | ")})?" else "Prism::node?" end end - def rbi_class + def call_seq_type if specific_kind - "T.nilable(Prism::#{specific_kind})" + "#{specific_kind} | nil" elsif union_kind - "T.nilable(T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")}))" + [*union_kind, "nil"].join(" | ") else - "T.nilable(Prism::Node)" + "Node | nil" end end def check_field_kind if union_kind - "[#{union_kind.join(', ')}, NilClass].include?(#{name}.class)" + "[#{union_kind.join(', ')}, ErrorRecoveryNode, NilClass].include?(#{name}.class)" else - "#{name}.nil? || #{name}.is_a?(#{ruby_type})" + "#{name}.nil? || #{name}.is_a?(#{ruby_type}) || #{name}.is_a?(ErrorRecoveryNode)" end end end @@ -202,23 +214,31 @@ module Prism # This represents a field on a node that is a list of nodes. We pass them as # references and store them directly on the struct. class NodeListField < NodeKindField - def rbs_class + def c_param + "pm_node_list_t #{name}" + end + + def element_rbs_class if specific_kind - "Array[#{specific_kind}]" + "#{specific_kind}" elsif union_kind - "Array[#{union_kind.join(" | ")}]" + "#{union_kind.join(" | ")}" else - "Array[Prism::node]" + "Prism::node" end end - def rbi_class + def rbs_class + "Array[#{element_rbs_class}]" + end + + def call_seq_type if specific_kind - "T::Array[Prism::#{specific_kind}]" + "Array[#{specific_kind}]" elsif union_kind - "T::Array[T.any(#{union_kind.map { |kind| "Prism::#{kind}" }.join(", ")})]" + "Array[#{union_kind.join(" | ")}]" else - "T::Array[Prism::Node]" + "Array[Node]" end end @@ -228,9 +248,9 @@ module Prism def check_field_kind if union_kind - "#{name}.all? { |n| [#{union_kind.join(', ')}].include?(n.class) }" + "#{name}.all? { |n| [#{union_kind.join(', ')}, ErrorRecoveryNode].include?(n.class) }" else - "#{name}.all? { |n| n.is_a?(#{ruby_type}) }" + "#{name}.all? { |n| n.is_a?(#{ruby_type}) || n.is_a?(ErrorRecoveryNode) }" end end end @@ -238,58 +258,74 @@ module Prism # This represents a field on a node that is the ID of a string interned # through the parser's constant pool. class ConstantField < Field + def c_param + "pm_constant_id_t #{name}" + end + def rbs_class "Symbol" end - def rbi_class + def call_seq_type "Symbol" end def java_type - JAVA_STRING_TYPE + JAVA_IDENTIFIER_TYPE end end # This represents a field on a node that is the ID of a string interned # through the parser's constant pool and can be optionally null. class OptionalConstantField < Field + def c_param + "pm_constant_id_t #{name}" + end + def rbs_class "Symbol?" end - def rbi_class - "T.nilable(Symbol)" + def call_seq_type + "Symbol | nil" end def java_type - JAVA_STRING_TYPE + JAVA_IDENTIFIER_TYPE end end # This represents a field on a node that is a list of IDs that are associated # with strings interned through the parser's constant pool. class ConstantListField < Field + def c_param + "pm_constant_id_list_t #{name}" + end + def rbs_class "Array[Symbol]" end - def rbi_class - "T::Array[Symbol]" + def call_seq_type + "Array[Symbol]" end def java_type - "#{JAVA_STRING_TYPE}[]" + "#{JAVA_IDENTIFIER_TYPE}[]" end end # This represents a field on a node that is a string. class StringField < Field + def c_param + "pm_string_t #{name}" + end + def rbs_class "String" end - def rbi_class + def call_seq_type "String" end @@ -300,6 +336,10 @@ module Prism # This represents a field on a node that is a location. class LocationField < Field + def c_param + "pm_location_t #{name}" + end + def semantic_field? false end @@ -308,8 +348,8 @@ module Prism "Location" end - def rbi_class - "Prism::Location" + def call_seq_type + "Location" end def java_type @@ -319,6 +359,10 @@ module Prism # This represents a field on a node that is a location that is optional. class OptionalLocationField < Field + def c_param + "pm_location_t #{name}" + end + def semantic_field? false end @@ -327,8 +371,8 @@ module Prism "Location?" end - def rbi_class - "T.nilable(Prism::Location)" + def call_seq_type + "Location | nil" end def java_type @@ -338,11 +382,15 @@ module Prism # This represents an integer field. class UInt8Field < Field + def c_param + "uint8_t #{name}" + end + def rbs_class "Integer" end - def rbi_class + def call_seq_type "Integer" end @@ -353,11 +401,15 @@ module Prism # This represents an integer field. class UInt32Field < Field + def c_param + "uint32_t #{name}" + end + def rbs_class "Integer" end - def rbi_class + def call_seq_type "Integer" end @@ -369,11 +421,15 @@ module Prism # This represents an arbitrarily-sized integer. When it gets to Ruby it will # be an Integer. class IntegerField < Field + def c_param + "pm_integer_t #{name}" + end + def rbs_class "Integer" end - def rbi_class + def call_seq_type "Integer" end @@ -385,11 +441,15 @@ module Prism # This represents a double-precision floating point number. When it gets to # Ruby it will be a Float. class DoubleField < Field + def c_param + "double #{name}" + end + def rbs_class "Float" end - def rbi_class + def call_seq_type "Float" end @@ -432,9 +492,6 @@ module Prism when "pattern expression" # the list of all possible types is too long with 37+ different classes "Node" - when Hash - kind = kind.fetch("on error") - REMOVE_ON_ERROR_TYPES ? nil : kind else kind end @@ -547,33 +604,17 @@ module Prism extension = File.extname(filepath.gsub(".erb", "")) heading = - case extension - when ".rb" + if extension == ".rb" <<~HEADING # frozen_string_literal: true + # :markup: markdown =begin + -- This file is generated by the templates/template.rb script and should not be modified manually. See #{filepath} if you are looking to modify the template - =end - - HEADING - when ".rbs" - <<~HEADING - # This file is generated by the templates/template.rb script and should not be - # modified manually. See #{filepath} - # if you are looking to modify the template - - HEADING - when ".rbi" - <<~HEADING - # typed: strict - - =begin - This file is generated by the templates/template.rb script and should not be - modified manually. See #{filepath} - if you are looking to modify the template + ++ =end HEADING @@ -582,7 +623,7 @@ module Prism /*----------------------------------------------------------------------------*/ /* This file is generated by the templates/template.rb script and should not */ /* be modified manually. See */ - /* #{filepath + " " * (74 - filepath.size) } */ + /* #{filepath.ljust(74)} */ /* if you are looking to modify the */ /* template */ /*----------------------------------------------------------------------------*/ @@ -602,8 +643,14 @@ module Prism end end - FileUtils.mkdir_p(File.dirname(write_to)) - File.write(write_to, contents) + begin + FileUtils.mkdir_p(File.dirname(write_to)) + File.write(write_to, contents) + rescue SystemCallError # EACCES, EPERM, EROFS, etc. + # Fall back to the current directory + FileUtils.mkdir_p(File.dirname(name)) + File.write(name, contents) + end end private @@ -639,13 +686,13 @@ module Prism TEMPLATES = [ "ext/prism/api_node.c", "include/prism/ast.h", - "include/prism/diagnostic.h", + "include/prism/internal/diagnostic.h", "javascript/src/deserialize.js", "javascript/src/nodes.js", "javascript/src/visitor.js", - "java/org/prism/Loader.java", - "java/org/prism/Nodes.java", - "java/org/prism/AbstractNodeVisitor.java", + "java/api/src/main/java-templates/org/ruby_lang/prism/Loader.java", + "java/api/src/main/java-templates/org/ruby_lang/prism/Nodes.java", + "java/api/src/main/java-templates/org/ruby_lang/prism/AbstractNodeVisitor.java", "lib/prism/compiler.rb", "lib/prism/dispatcher.rb", "lib/prism/dot_visitor.rb", @@ -657,19 +704,11 @@ module Prism "lib/prism/serialize.rb", "lib/prism/visitor.rb", "src/diagnostic.c", + "src/json.c", "src/node.c", "src/prettyprint.c", "src/serialize.c", - "src/token_type.c", - "rbi/prism/dsl.rbi", - "rbi/prism/node.rbi", - "rbi/prism/visitor.rbi", - "sig/prism.rbs", - "sig/prism/dsl.rbs", - "sig/prism/mutation_compiler.rbs", - "sig/prism/node.rbs", - "sig/prism/visitor.rbs", - "sig/prism/_private/dot_visitor.rbs" + "src/tokens.c" ] end end diff --git a/prism/util/pm_buffer.h b/prism/util/pm_buffer.h deleted file mode 100644 index f3c20ab2a5..0000000000 --- a/prism/util/pm_buffer.h +++ /dev/null @@ -1,228 +0,0 @@ -/** - * @file pm_buffer.h - * - * A wrapper around a contiguous block of allocated memory. - */ -#ifndef PRISM_BUFFER_H -#define PRISM_BUFFER_H - -#include "prism/defines.h" -#include "prism/util/pm_char.h" - -#include <assert.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> - -/** - * A pm_buffer_t is a simple memory buffer that stores data in a contiguous - * block of memory. - */ -typedef struct { - /** The length of the buffer in bytes. */ - size_t length; - - /** The capacity of the buffer in bytes that has been allocated. */ - size_t capacity; - - /** A pointer to the start of the buffer. */ - char *value; -} pm_buffer_t; - -/** - * Return the size of the pm_buffer_t struct. - * - * @returns The size of the pm_buffer_t struct. - */ -PRISM_EXPORTED_FUNCTION size_t pm_buffer_sizeof(void); - -/** - * Initialize a pm_buffer_t with the given capacity. - * - * @param buffer The buffer to initialize. - * @param capacity The capacity of the buffer. - * @returns True if the buffer was initialized successfully, false otherwise. - */ -bool pm_buffer_init_capacity(pm_buffer_t *buffer, size_t capacity); - -/** - * Initialize a pm_buffer_t with its default values. - * - * @param buffer The buffer to initialize. - * @returns True if the buffer was initialized successfully, false otherwise. - */ -PRISM_EXPORTED_FUNCTION bool pm_buffer_init(pm_buffer_t *buffer); - -/** - * Return the value of the buffer. - * - * @param buffer The buffer to get the value of. - * @returns The value of the buffer. - */ -PRISM_EXPORTED_FUNCTION char * pm_buffer_value(const pm_buffer_t *buffer); - -/** - * Return the length of the buffer. - * - * @param buffer The buffer to get the length of. - * @returns The length of the buffer. - */ -PRISM_EXPORTED_FUNCTION size_t pm_buffer_length(const pm_buffer_t *buffer); - -/** - * Append the given amount of space as zeroes to the buffer. - * - * @param buffer The buffer to append to. - * @param length The amount of space to append and zero. - */ -void pm_buffer_append_zeroes(pm_buffer_t *buffer, size_t length); - -/** - * Append a formatted string to the buffer. - * - * @param buffer The buffer to append to. - * @param format The format string to append. - * @param ... The arguments to the format string. - */ -void pm_buffer_append_format(pm_buffer_t *buffer, const char *format, ...) PRISM_ATTRIBUTE_FORMAT(2, 3); - -/** - * Append a string to the buffer. - * - * @param buffer The buffer to append to. - * @param value The string to append. - * @param length The length of the string to append. - */ -void pm_buffer_append_string(pm_buffer_t *buffer, const char *value, size_t length); - -/** - * Append a list of bytes to the buffer. - * - * @param buffer The buffer to append to. - * @param value The bytes to append. - * @param length The length of the bytes to append. - */ -void pm_buffer_append_bytes(pm_buffer_t *buffer, const uint8_t *value, size_t length); - -/** - * Append a single byte to the buffer. - * - * @param buffer The buffer to append to. - * @param value The byte to append. - */ -void pm_buffer_append_byte(pm_buffer_t *buffer, uint8_t value); - -/** - * Append a 32-bit unsigned integer to the buffer as a variable-length integer. - * - * @param buffer The buffer to append to. - * @param value The integer to append. - */ -void pm_buffer_append_varuint(pm_buffer_t *buffer, uint32_t value); - -/** - * Append a 32-bit signed integer to the buffer as a variable-length integer. - * - * @param buffer The buffer to append to. - * @param value The integer to append. - */ -void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value); - -/** - * Append a double to the buffer. - * - * @param buffer The buffer to append to. - * @param value The double to append. - */ -void pm_buffer_append_double(pm_buffer_t *buffer, double value); - -/** - * Append a unicode codepoint to the buffer. - * - * @param buffer The buffer to append to. - * @param value The character to append. - * @returns True if the codepoint was valid and appended successfully, false - * otherwise. - */ -bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value); - -/** - * The different types of escaping that can be performed by the buffer when - * appending a slice of Ruby source code. - */ -typedef enum { - PM_BUFFER_ESCAPING_RUBY, - PM_BUFFER_ESCAPING_JSON -} pm_buffer_escaping_t; - -/** - * Append a slice of source code to the buffer. - * - * @param buffer The buffer to append to. - * @param source The source code to append. - * @param length The length of the source code to append. - * @param escaping The type of escaping to perform. - */ -void pm_buffer_append_source(pm_buffer_t *buffer, const uint8_t *source, size_t length, pm_buffer_escaping_t escaping); - -/** - * Prepend the given string to the buffer. - * - * @param buffer The buffer to prepend to. - * @param value The string to prepend. - * @param length The length of the string to prepend. - */ -void pm_buffer_prepend_string(pm_buffer_t *buffer, const char *value, size_t length); - -/** - * Concatenate one buffer onto another. - * - * @param destination The buffer to concatenate onto. - * @param source The buffer to concatenate. - */ -void pm_buffer_concat(pm_buffer_t *destination, const pm_buffer_t *source); - -/** - * Clear the buffer by reducing its size to 0. This does not free the allocated - * memory, but it does allow the buffer to be reused. - * - * @param buffer The buffer to clear. - */ -void pm_buffer_clear(pm_buffer_t *buffer); - -/** - * Strip the whitespace from the end of the buffer. - * - * @param buffer The buffer to strip. - */ -void pm_buffer_rstrip(pm_buffer_t *buffer); - -/** - * Checks if the buffer includes the given value. - * - * @param buffer The buffer to check. - * @param value The value to check for. - * @returns The index of the first occurrence of the value in the buffer, or - * SIZE_MAX if the value is not found. - */ -size_t pm_buffer_index(const pm_buffer_t *buffer, char value); - -/** - * Insert the given string into the buffer at the given index. - * - * @param buffer The buffer to insert into. - * @param index The index to insert at. - * @param value The string to insert. - * @param length The length of the string to insert. - */ -void pm_buffer_insert(pm_buffer_t *buffer, size_t index, const char *value, size_t length); - -/** - * Free the memory associated with the buffer. - * - * @param buffer The buffer to free. - */ -PRISM_EXPORTED_FUNCTION void pm_buffer_free(pm_buffer_t *buffer); - -#endif diff --git a/prism/util/pm_char.h b/prism/util/pm_char.h deleted file mode 100644 index deeafd6321..0000000000 --- a/prism/util/pm_char.h +++ /dev/null @@ -1,204 +0,0 @@ -/** - * @file pm_char.h - * - * Functions for working with characters and strings. - */ -#ifndef PRISM_CHAR_H -#define PRISM_CHAR_H - -#include "prism/defines.h" -#include "prism/util/pm_newline_list.h" - -#include <stdbool.h> -#include <stddef.h> - -/** - * Returns the number of characters at the start of the string that are - * whitespace. Disallows searching past the given maximum number of characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @return The number of characters at the start of the string that are - * whitespace. - */ -size_t pm_strspn_whitespace(const uint8_t *string, ptrdiff_t length); - -/** - * Returns the number of characters at the start of the string that are - * whitespace while also tracking the location of each newline. Disallows - * searching past the given maximum number of characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @param newline_list The list of newlines to populate. - * @return The number of characters at the start of the string that are - * whitespace. - */ -size_t pm_strspn_whitespace_newlines(const uint8_t *string, ptrdiff_t length, pm_newline_list_t *newline_list); - -/** - * Returns the number of characters at the start of the string that are inline - * whitespace. Disallows searching past the given maximum number of characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @return The number of characters at the start of the string that are inline - * whitespace. - */ -size_t pm_strspn_inline_whitespace(const uint8_t *string, ptrdiff_t length); - -/** - * Returns the number of characters at the start of the string that are decimal - * digits. Disallows searching past the given maximum number of characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @return The number of characters at the start of the string that are decimal - * digits. - */ -size_t pm_strspn_decimal_digit(const uint8_t *string, ptrdiff_t length); - -/** - * Returns the number of characters at the start of the string that are - * hexadecimal digits. Disallows searching past the given maximum number of - * characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @return The number of characters at the start of the string that are - * hexadecimal digits. - */ -size_t pm_strspn_hexadecimal_digit(const uint8_t *string, ptrdiff_t length); - -/** - * Returns the number of characters at the start of the string that are octal - * digits or underscores. Disallows searching past the given maximum number of - * characters. - * - * If multiple underscores are found in a row or if an underscore is - * found at the end of the number, then the invalid pointer is set to the index - * of the first invalid underscore. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @param invalid The pointer to set to the index of the first invalid - * underscore. - * @return The number of characters at the start of the string that are octal - * digits or underscores. - */ -size_t pm_strspn_octal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); - -/** - * Returns the number of characters at the start of the string that are decimal - * digits or underscores. Disallows searching past the given maximum number of - * characters. - * - * If multiple underscores are found in a row or if an underscore is - * found at the end of the number, then the invalid pointer is set to the index - * of the first invalid underscore. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @param invalid The pointer to set to the index of the first invalid - * underscore. - * @return The number of characters at the start of the string that are decimal - * digits or underscores. - */ -size_t pm_strspn_decimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); - -/** - * Returns the number of characters at the start of the string that are - * hexadecimal digits or underscores. Disallows searching past the given maximum - * number of characters. - * - * If multiple underscores are found in a row or if an underscore is - * found at the end of the number, then the invalid pointer is set to the index - * of the first invalid underscore. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @param invalid The pointer to set to the index of the first invalid - * underscore. - * @return The number of characters at the start of the string that are - * hexadecimal digits or underscores. - */ -size_t pm_strspn_hexadecimal_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); - -/** - * Returns the number of characters at the start of the string that are regexp - * options. Disallows searching past the given maximum number of characters. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @return The number of characters at the start of the string that are regexp - * options. - */ -size_t pm_strspn_regexp_option(const uint8_t *string, ptrdiff_t length); - -/** - * Returns the number of characters at the start of the string that are binary - * digits or underscores. Disallows searching past the given maximum number of - * characters. - * - * If multiple underscores are found in a row or if an underscore is - * found at the end of the number, then the invalid pointer is set to the index - * of the first invalid underscore. - * - * @param string The string to search. - * @param length The maximum number of characters to search. - * @param invalid The pointer to set to the index of the first invalid - * underscore. - * @return The number of characters at the start of the string that are binary - * digits or underscores. - */ -size_t pm_strspn_binary_number(const uint8_t *string, ptrdiff_t length, const uint8_t **invalid); - -/** - * Returns true if the given character is a whitespace character. - * - * @param b The character to check. - * @return True if the given character is a whitespace character. - */ -bool pm_char_is_whitespace(const uint8_t b); - -/** - * Returns true if the given character is an inline whitespace character. - * - * @param b The character to check. - * @return True if the given character is an inline whitespace character. - */ -bool pm_char_is_inline_whitespace(const uint8_t b); - -/** - * Returns true if the given character is a binary digit. - * - * @param b The character to check. - * @return True if the given character is a binary digit. - */ -bool pm_char_is_binary_digit(const uint8_t b); - -/** - * Returns true if the given character is an octal digit. - * - * @param b The character to check. - * @return True if the given character is an octal digit. - */ -bool pm_char_is_octal_digit(const uint8_t b); - -/** - * Returns true if the given character is a decimal digit. - * - * @param b The character to check. - * @return True if the given character is a decimal digit. - */ -bool pm_char_is_decimal_digit(const uint8_t b); - -/** - * Returns true if the given character is a hexadecimal digit. - * - * @param b The character to check. - * @return True if the given character is a hexadecimal digit. - */ -bool pm_char_is_hexadecimal_digit(const uint8_t b); - -#endif diff --git a/prism/util/pm_constant_pool.h b/prism/util/pm_constant_pool.h deleted file mode 100644 index 6df23f8f50..0000000000 --- a/prism/util/pm_constant_pool.h +++ /dev/null @@ -1,218 +0,0 @@ -/** - * @file pm_constant_pool.h - * - * A data structure that stores a set of strings. - * - * Each string is assigned a unique id, which can be used to compare strings for - * equality. This comparison ends up being much faster than strcmp, since it - * only requires a single integer comparison. - */ -#ifndef PRISM_CONSTANT_POOL_H -#define PRISM_CONSTANT_POOL_H - -#include "prism/defines.h" - -#include <assert.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdlib.h> -#include <string.h> - -/** - * When we allocate constants into the pool, we reserve 0 to mean that the slot - * is not yet filled. This constant is reused in other places to indicate the - * lack of a constant id. - */ -#define PM_CONSTANT_ID_UNSET 0 - -/** - * A constant id is a unique identifier for a constant in the constant pool. - */ -typedef uint32_t pm_constant_id_t; - -/** - * A list of constant IDs. Usually used to represent a set of locals. - */ -typedef struct { - /** The number of constant ids in the list. */ - size_t size; - - /** The number of constant ids that have been allocated in the list. */ - size_t capacity; - - /** The constant ids in the list. */ - pm_constant_id_t *ids; -} pm_constant_id_list_t; - -/** - * Initialize a list of constant ids. - * - * @param list The list to initialize. - */ -void pm_constant_id_list_init(pm_constant_id_list_t *list); - -/** - * Initialize a list of constant ids with a given capacity. - * - * @param list The list to initialize. - * @param capacity The initial capacity of the list. - */ -void pm_constant_id_list_init_capacity(pm_constant_id_list_t *list, size_t capacity); - -/** - * Append a constant id to a list of constant ids. Returns false if any - * potential reallocations fail. - * - * @param list The list to append to. - * @param id The id to append. - * @return Whether the append succeeded. - */ -bool pm_constant_id_list_append(pm_constant_id_list_t *list, pm_constant_id_t id); - -/** - * Insert a constant id into a list of constant ids at the specified index. - * - * @param list The list to insert into. - * @param index The index at which to insert. - * @param id The id to insert. - */ -void pm_constant_id_list_insert(pm_constant_id_list_t *list, size_t index, pm_constant_id_t id); - -/** - * Checks if the current constant id list includes the given constant id. - * - * @param list The list to check. - * @param id The id to check for. - * @return Whether the list includes the given id. - */ -bool pm_constant_id_list_includes(pm_constant_id_list_t *list, pm_constant_id_t id); - -/** - * Free the memory associated with a list of constant ids. - * - * @param list The list to free. - */ -void pm_constant_id_list_free(pm_constant_id_list_t *list); - -/** - * The type of bucket in the constant pool hash map. This determines how the - * bucket should be freed. - */ -typedef unsigned int pm_constant_pool_bucket_type_t; - -/** By default, each constant is a slice of the source. */ -static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_DEFAULT = 0; - -/** An owned constant is one for which memory has been allocated. */ -static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_OWNED = 1; - -/** A constant constant is known at compile time. */ -static const pm_constant_pool_bucket_type_t PM_CONSTANT_POOL_BUCKET_CONSTANT = 2; - -/** A bucket in the hash map. */ -typedef struct { - /** The incremental ID used for indexing back into the pool. */ - unsigned int id: 30; - - /** The type of the bucket, which determines how to free it. */ - pm_constant_pool_bucket_type_t type: 2; - - /** The hash of the bucket. */ - uint32_t hash; -} pm_constant_pool_bucket_t; - -/** A constant in the pool which effectively stores a string. */ -typedef struct { - /** A pointer to the start of the string. */ - const uint8_t *start; - - /** The length of the string. */ - size_t length; -} pm_constant_t; - -/** The overall constant pool, which stores constants found while parsing. */ -typedef struct { - /** The buckets in the hash map. */ - pm_constant_pool_bucket_t *buckets; - - /** The constants that are stored in the buckets. */ - pm_constant_t *constants; - - /** The number of buckets in the hash map. */ - uint32_t size; - - /** The number of buckets that have been allocated in the hash map. */ - uint32_t capacity; -} pm_constant_pool_t; - -/** - * Initialize a new constant pool with a given capacity. - * - * @param pool The pool to initialize. - * @param capacity The initial capacity of the pool. - * @return Whether the initialization succeeded. - */ -bool pm_constant_pool_init(pm_constant_pool_t *pool, uint32_t capacity); - -/** - * Return a pointer to the constant indicated by the given constant id. - * - * @param pool The pool to get the constant from. - * @param constant_id The id of the constant to get. - * @return A pointer to the constant. - */ -pm_constant_t * pm_constant_pool_id_to_constant(const pm_constant_pool_t *pool, pm_constant_id_t constant_id); - -/** - * Find a constant in a constant pool. Returns the id of the constant, or 0 if - * the constant is not found. - * - * @param pool The pool to find the constant in. - * @param start A pointer to the start of the constant. - * @param length The length of the constant. - * @return The id of the constant. - */ -pm_constant_id_t pm_constant_pool_find(const pm_constant_pool_t *pool, const uint8_t *start, size_t length); - -/** - * Insert a constant into a constant pool that is a slice of a source string. - * Returns the id of the constant, or 0 if any potential calls to resize fail. - * - * @param pool The pool to insert the constant into. - * @param start A pointer to the start of the constant. - * @param length The length of the constant. - * @return The id of the constant. - */ -pm_constant_id_t pm_constant_pool_insert_shared(pm_constant_pool_t *pool, const uint8_t *start, size_t length); - -/** - * Insert a constant into a constant pool from memory that is now owned by the - * constant pool. Returns the id of the constant, or 0 if any potential calls to - * resize fail. - * - * @param pool The pool to insert the constant into. - * @param start A pointer to the start of the constant. - * @param length The length of the constant. - * @return The id of the constant. - */ -pm_constant_id_t pm_constant_pool_insert_owned(pm_constant_pool_t *pool, uint8_t *start, size_t length); - -/** - * Insert a constant into a constant pool from memory that is constant. Returns - * the id of the constant, or 0 if any potential calls to resize fail. - * - * @param pool The pool to insert the constant into. - * @param start A pointer to the start of the constant. - * @param length The length of the constant. - * @return The id of the constant. - */ -pm_constant_id_t pm_constant_pool_insert_constant(pm_constant_pool_t *pool, const uint8_t *start, size_t length); - -/** - * Free the memory associated with a constant pool. - * - * @param pool The pool to free. - */ -void pm_constant_pool_free(pm_constant_pool_t *pool); - -#endif diff --git a/prism/util/pm_integer.h b/prism/util/pm_integer.h deleted file mode 100644 index a9e2966703..0000000000 --- a/prism/util/pm_integer.h +++ /dev/null @@ -1,126 +0,0 @@ -/** - * @file pm_integer.h - * - * This module provides functions for working with arbitrary-sized integers. - */ -#ifndef PRISM_NUMBER_H -#define PRISM_NUMBER_H - -#include "prism/defines.h" -#include "prism/util/pm_buffer.h" - -#include <assert.h> -#include <stdbool.h> -#include <stdint.h> -#include <stdlib.h> - -/** - * A structure represents an arbitrary-sized integer. - */ -typedef struct { - /** - * The number of allocated values. length is set to 0 if the integer fits - * into uint32_t. - */ - size_t length; - - /** - * List of 32-bit integers. Set to NULL if the integer fits into uint32_t. - */ - uint32_t *values; - - /** - * Embedded value for small integer. This value is set to 0 if the value - * does not fit into uint32_t. - */ - uint32_t value; - - /** - * Whether or not the integer is negative. It is stored this way so that a - * zeroed pm_integer_t is always positive zero. - */ - bool negative; -} pm_integer_t; - -/** - * An enum controlling the base of an integer. It is expected that the base is - * already known before parsing the integer, even though it could be derived - * from the string itself. - */ -typedef enum { - /** The default decimal base, with no prefix. Leading 0s will be ignored. */ - PM_INTEGER_BASE_DEFAULT, - - /** The binary base, indicated by a 0b or 0B prefix. */ - PM_INTEGER_BASE_BINARY, - - /** The octal base, indicated by a 0, 0o, or 0O prefix. */ - PM_INTEGER_BASE_OCTAL, - - /** The decimal base, indicated by a 0d, 0D, or empty prefix. */ - PM_INTEGER_BASE_DECIMAL, - - /** The hexadecimal base, indicated by a 0x or 0X prefix. */ - PM_INTEGER_BASE_HEXADECIMAL, - - /** - * An unknown base, in which case pm_integer_parse will derive it based on - * the content of the string. This is less efficient and does more - * comparisons, so if callers know the base ahead of time, they should use - * that instead. - */ - PM_INTEGER_BASE_UNKNOWN -} pm_integer_base_t; - -/** - * Parse an integer from a string. This assumes that the format of the integer - * has already been validated, as internal validation checks are not performed - * here. - * - * @param integer The integer to parse into. - * @param base The base of the integer. - * @param start The start of the string. - * @param end The end of the string. - */ -void pm_integer_parse(pm_integer_t *integer, pm_integer_base_t base, const uint8_t *start, const uint8_t *end); - -/** - * Compare two integers. This function returns -1 if the left integer is less - * than the right integer, 0 if they are equal, and 1 if the left integer is - * greater than the right integer. - * - * @param left The left integer to compare. - * @param right The right integer to compare. - * @return The result of the comparison. - */ -int pm_integer_compare(const pm_integer_t *left, const pm_integer_t *right); - -/** - * Reduce a ratio of integers to its simplest form. - * - * If either the numerator or denominator do not fit into a 32-bit integer, then - * this function is a no-op. In the future, we may consider reducing even the - * larger numbers, but for now we're going to keep it simple. - * - * @param numerator The numerator of the ratio. - * @param denominator The denominator of the ratio. - */ -void pm_integers_reduce(pm_integer_t *numerator, pm_integer_t *denominator); - -/** - * Convert an integer to a decimal string. - * - * @param buffer The buffer to append the string to. - * @param integer The integer to convert to a string. - */ -PRISM_EXPORTED_FUNCTION void pm_integer_string(pm_buffer_t *buffer, const pm_integer_t *integer); - -/** - * Free the internal memory of an integer. This memory will only be allocated if - * the integer exceeds the size of a single node in the linked list. - * - * @param integer The integer to free. - */ -PRISM_EXPORTED_FUNCTION void pm_integer_free(pm_integer_t *integer); - -#endif diff --git a/prism/util/pm_list.c b/prism/util/pm_list.c deleted file mode 100644 index ad2294cd60..0000000000 --- a/prism/util/pm_list.c +++ /dev/null @@ -1,49 +0,0 @@ -#include "prism/util/pm_list.h" - -/** - * Returns true if the given list is empty. - */ -PRISM_EXPORTED_FUNCTION bool -pm_list_empty_p(pm_list_t *list) { - return list->head == NULL; -} - -/** - * Returns the size of the list. - */ -PRISM_EXPORTED_FUNCTION size_t -pm_list_size(pm_list_t *list) { - return list->size; -} - -/** - * Append a node to the given list. - */ -void -pm_list_append(pm_list_t *list, pm_list_node_t *node) { - if (list->head == NULL) { - list->head = node; - } else { - list->tail->next = node; - } - - list->tail = node; - list->size++; -} - -/** - * Deallocate the internal state of the given list. - */ -PRISM_EXPORTED_FUNCTION void -pm_list_free(pm_list_t *list) { - pm_list_node_t *node = list->head; - pm_list_node_t *next; - - while (node != NULL) { - next = node->next; - xfree(node); - node = next; - } - - list->size = 0; -} diff --git a/prism/util/pm_memchr.h b/prism/util/pm_memchr.h deleted file mode 100644 index e0671eaed3..0000000000 --- a/prism/util/pm_memchr.h +++ /dev/null @@ -1,29 +0,0 @@ -/** - * @file pm_memchr.h - * - * A custom memchr implementation. - */ -#ifndef PRISM_MEMCHR_H -#define PRISM_MEMCHR_H - -#include "prism/defines.h" -#include "prism/encoding.h" - -#include <stddef.h> - -/** - * We need to roll our own memchr to handle cases where the encoding changes and - * we need to search for a character in a buffer that could be the trailing byte - * of a multibyte character. - * - * @param source The source string. - * @param character The character to search for. - * @param number The maximum number of bytes to search. - * @param encoding_changed Whether the encoding changed. - * @param encoding A pointer to the encoding. - * @return A pointer to the first occurrence of the character in the source - * string, or NULL if no such character exists. - */ -void * pm_memchr(const void *source, int character, size_t number, bool encoding_changed, const pm_encoding_t *encoding); - -#endif diff --git a/prism/util/pm_newline_list.c b/prism/util/pm_newline_list.c deleted file mode 100644 index 8331618f54..0000000000 --- a/prism/util/pm_newline_list.c +++ /dev/null @@ -1,125 +0,0 @@ -#include "prism/util/pm_newline_list.h" - -/** - * Initialize a new newline list with the given capacity. Returns true if the - * allocation of the offsets succeeds, otherwise returns false. - */ -bool -pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity) { - list->offsets = (size_t *) xcalloc(capacity, sizeof(size_t)); - if (list->offsets == NULL) return false; - - list->start = start; - - // This is 1 instead of 0 because we want to include the first line of the - // file as having offset 0, which is set because of calloc. - list->size = 1; - list->capacity = capacity; - - return true; -} - -/** - * Clear out the newlines that have been appended to the list. - */ -void -pm_newline_list_clear(pm_newline_list_t *list) { - list->size = 1; -} - -/** - * Append a new offset to the newline list. Returns true if the reallocation of - * the offsets succeeds (if one was necessary), otherwise returns false. - */ -bool -pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor) { - if (list->size == list->capacity) { - size_t *original_offsets = list->offsets; - - list->capacity = (list->capacity * 3) / 2; - list->offsets = (size_t *) xcalloc(list->capacity, sizeof(size_t)); - if (list->offsets == NULL) return false; - - memcpy(list->offsets, original_offsets, list->size * sizeof(size_t)); - xfree(original_offsets); - } - - assert(*cursor == '\n'); - assert(cursor >= list->start); - size_t newline_offset = (size_t) (cursor - list->start + 1); - - assert(list->size == 0 || newline_offset > list->offsets[list->size - 1]); - list->offsets[list->size++] = newline_offset; - - return true; -} - -/** - * Returns the line of the given offset. If the offset is not in the list, the - * line of the closest offset less than the given offset is returned. - */ -int32_t -pm_newline_list_line(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) { - assert(cursor >= list->start); - size_t offset = (size_t) (cursor - list->start); - - size_t left = 0; - size_t right = list->size - 1; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - - if (list->offsets[mid] == offset) { - return ((int32_t) mid) + start_line; - } - - if (list->offsets[mid] < offset) { - left = mid + 1; - } else { - right = mid - 1; - } - } - - return ((int32_t) left) + start_line - 1; -} - -/** - * Returns the line and column of the given offset. If the offset is not in the - * list, the line and column of the closest offset less than the given offset - * are returned. - */ -pm_line_column_t -pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line) { - assert(cursor >= list->start); - size_t offset = (size_t) (cursor - list->start); - - size_t left = 0; - size_t right = list->size - 1; - - while (left <= right) { - size_t mid = left + (right - left) / 2; - - if (list->offsets[mid] == offset) { - return ((pm_line_column_t) { ((int32_t) mid) + start_line, 0 }); - } - - if (list->offsets[mid] < offset) { - left = mid + 1; - } else { - right = mid - 1; - } - } - - return ((pm_line_column_t) { - .line = ((int32_t) left) + start_line - 1, - .column = (uint32_t) (offset - list->offsets[left - 1]) - }); -} - -/** - * Free the internal memory allocated for the newline list. - */ -void -pm_newline_list_free(pm_newline_list_t *list) { - xfree(list->offsets); -} diff --git a/prism/util/pm_newline_list.h b/prism/util/pm_newline_list.h deleted file mode 100644 index 406abe8ba5..0000000000 --- a/prism/util/pm_newline_list.h +++ /dev/null @@ -1,113 +0,0 @@ -/** - * @file pm_newline_list.h - * - * A list of byte offsets of newlines in a string. - * - * When compiling the syntax tree, it's necessary to know the line and column - * of many nodes. This is necessary to support things like error messages, - * tracepoints, etc. - * - * It's possible that we could store the start line, start column, end line, and - * end column on every node in addition to the offsets that we already store, - * but that would be quite a lot of memory overhead. - */ -#ifndef PRISM_NEWLINE_LIST_H -#define PRISM_NEWLINE_LIST_H - -#include "prism/defines.h" - -#include <assert.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdlib.h> - -/** - * A list of offsets of newlines in a string. The offsets are assumed to be - * sorted/inserted in ascending order. - */ -typedef struct { - /** A pointer to the start of the source string. */ - const uint8_t *start; - - /** The number of offsets in the list. */ - size_t size; - - /** The capacity of the list that has been allocated. */ - size_t capacity; - - /** The list of offsets. */ - size_t *offsets; -} pm_newline_list_t; - -/** - * A line and column in a string. - */ -typedef struct { - /** The line number. */ - int32_t line; - - /** The column number. */ - uint32_t column; -} pm_line_column_t; - -/** - * Initialize a new newline list with the given capacity. Returns true if the - * allocation of the offsets succeeds, otherwise returns false. - * - * @param list The list to initialize. - * @param start A pointer to the start of the source string. - * @param capacity The initial capacity of the list. - * @return True if the allocation of the offsets succeeds, otherwise false. - */ -bool pm_newline_list_init(pm_newline_list_t *list, const uint8_t *start, size_t capacity); - -/** - * Clear out the newlines that have been appended to the list. - * - * @param list The list to clear. - */ -void -pm_newline_list_clear(pm_newline_list_t *list); - -/** - * Append a new offset to the newline list. Returns true if the reallocation of - * the offsets succeeds (if one was necessary), otherwise returns false. - * - * @param list The list to append to. - * @param cursor A pointer to the offset to append. - * @return True if the reallocation of the offsets succeeds (if one was - * necessary), otherwise false. - */ -bool pm_newline_list_append(pm_newline_list_t *list, const uint8_t *cursor); - -/** - * Returns the line of the given offset. If the offset is not in the list, the - * line of the closest offset less than the given offset is returned. - * - * @param list The list to search. - * @param cursor A pointer to the offset to search for. - * @param start_line The line to start counting from. - * @return The line of the given offset. - */ -int32_t pm_newline_list_line(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line); - -/** - * Returns the line and column of the given offset. If the offset is not in the - * list, the line and column of the closest offset less than the given offset - * are returned. - * - * @param list The list to search. - * @param cursor A pointer to the offset to search for. - * @param start_line The line to start counting from. - * @return The line and column of the given offset. - */ -pm_line_column_t pm_newline_list_line_column(const pm_newline_list_t *list, const uint8_t *cursor, int32_t start_line); - -/** - * Free the internal memory allocated for the newline list. - * - * @param list The list to free. - */ -void pm_newline_list_free(pm_newline_list_t *list); - -#endif diff --git a/prism/util/pm_string.c b/prism/util/pm_string.c deleted file mode 100644 index 75422fbdf2..0000000000 --- a/prism/util/pm_string.c +++ /dev/null @@ -1,383 +0,0 @@ -#include "prism/util/pm_string.h" - -/** - * Returns the size of the pm_string_t struct. This is necessary to allocate the - * correct amount of memory in the FFI backend. - */ -PRISM_EXPORTED_FUNCTION size_t -pm_string_sizeof(void) { - return sizeof(pm_string_t); -} - -/** - * Initialize a shared string that is based on initial input. - */ -void -pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end) { - assert(start <= end); - - *string = (pm_string_t) { - .type = PM_STRING_SHARED, - .source = start, - .length = (size_t) (end - start) - }; -} - -/** - * Initialize an owned string that is responsible for freeing allocated memory. - */ -void -pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length) { - *string = (pm_string_t) { - .type = PM_STRING_OWNED, - .source = source, - .length = length - }; -} - -/** - * Initialize a constant string that doesn't own its memory source. - */ -void -pm_string_constant_init(pm_string_t *string, const char *source, size_t length) { - *string = (pm_string_t) { - .type = PM_STRING_CONSTANT, - .source = (const uint8_t *) source, - .length = length - }; -} - -#ifdef _WIN32 -/** - * Represents a file handle on Windows, where the path will need to be freed - * when the file is closed. - */ -typedef struct { - /** The path to the file, which will become allocated memory. */ - WCHAR *path; - - /** The handle to the file, which will start as uninitialized memory. */ - HANDLE file; -} pm_string_file_handle_t; - -/** - * Open the file indicated by the filepath parameter for reading on Windows. - * Perform any kind of normalization that needs to happen on the filepath. - */ -static pm_string_init_result_t -pm_string_file_handle_open(pm_string_file_handle_t *handle, const char *filepath) { - int length = MultiByteToWideChar(CP_UTF8, 0, filepath, -1, NULL, 0); - if (length == 0) return PM_STRING_INIT_ERROR_GENERIC; - - handle->path = xmalloc(sizeof(WCHAR) * ((size_t) length)); - if ((handle->path == NULL) || (MultiByteToWideChar(CP_UTF8, 0, filepath, -1, handle->path, length) == 0)) { - xfree(handle->path); - return PM_STRING_INIT_ERROR_GENERIC; - } - - handle->file = CreateFileW(handle->path, GENERIC_READ, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_READONLY, NULL); - if (handle->file == INVALID_HANDLE_VALUE) { - pm_string_init_result_t result = PM_STRING_INIT_ERROR_GENERIC; - - if (GetLastError() == ERROR_ACCESS_DENIED) { - DWORD attributes = GetFileAttributesW(handle->path); - if ((attributes != INVALID_FILE_ATTRIBUTES) && (attributes & FILE_ATTRIBUTE_DIRECTORY)) { - result = PM_STRING_INIT_ERROR_DIRECTORY; - } - } - - xfree(handle->path); - return result; - } - - return PM_STRING_INIT_SUCCESS; -} - -/** - * Close the file handle and free the path. - */ -static void -pm_string_file_handle_close(pm_string_file_handle_t *handle) { - xfree(handle->path); - CloseHandle(handle->file); -} -#endif - -/** - * Read the file indicated by the filepath parameter into source and load its - * contents and size into the given `pm_string_t`. The given `pm_string_t` - * should be freed using `pm_string_free` when it is no longer used. - * - * We want to use demand paging as much as possible in order to avoid having to - * read the entire file into memory (which could be detrimental to performance - * for large files). This means that if we're on windows we'll use - * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use - * `mmap`, and on other POSIX systems we'll use `read`. - */ -PRISM_EXPORTED_FUNCTION pm_string_init_result_t -pm_string_mapped_init(pm_string_t *string, const char *filepath) { -#ifdef _WIN32 - // Open the file for reading. - pm_string_file_handle_t handle; - pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath); - if (result != PM_STRING_INIT_SUCCESS) return result; - - // Get the file size. - DWORD file_size = GetFileSize(handle.file, NULL); - if (file_size == INVALID_FILE_SIZE) { - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // If the file is empty, then we don't need to do anything else, we'll set - // the source to a constant empty string and return. - if (file_size == 0) { - pm_string_file_handle_close(&handle); - const uint8_t source[] = ""; - *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; - return PM_STRING_INIT_SUCCESS; - } - - // Create a mapping of the file. - HANDLE mapping = CreateFileMapping(handle.file, NULL, PAGE_READONLY, 0, 0, NULL); - if (mapping == NULL) { - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Map the file into memory. - uint8_t *source = (uint8_t *) MapViewOfFile(mapping, FILE_MAP_READ, 0, 0, 0); - CloseHandle(mapping); - pm_string_file_handle_close(&handle); - - if (source == NULL) { - return PM_STRING_INIT_ERROR_GENERIC; - } - - *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = (size_t) file_size }; - return PM_STRING_INIT_SUCCESS; -#elif defined(_POSIX_MAPPED_FILES) - // Open the file for reading - int fd = open(filepath, O_RDONLY); - if (fd == -1) { - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Stat the file to get the file size - struct stat sb; - if (fstat(fd, &sb) == -1) { - close(fd); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Ensure it is a file and not a directory - if (S_ISDIR(sb.st_mode)) { - close(fd); - return PM_STRING_INIT_ERROR_DIRECTORY; - } - - // mmap the file descriptor to virtually get the contents - size_t size = (size_t) sb.st_size; - uint8_t *source = NULL; - - if (size == 0) { - close(fd); - const uint8_t source[] = ""; - *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; - return PM_STRING_INIT_SUCCESS; - } - - source = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); - if (source == MAP_FAILED) { - close(fd); - return PM_STRING_INIT_ERROR_GENERIC; - } - - close(fd); - *string = (pm_string_t) { .type = PM_STRING_MAPPED, .source = source, .length = size }; - return PM_STRING_INIT_SUCCESS; -#else - return pm_string_file_init(string, filepath); -#endif -} - -/** - * Read the file indicated by the filepath parameter into source and load its - * contents and size into the given `pm_string_t`. The given `pm_string_t` - * should be freed using `pm_string_free` when it is no longer used. - */ -PRISM_EXPORTED_FUNCTION pm_string_init_result_t -pm_string_file_init(pm_string_t *string, const char *filepath) { -#ifdef _WIN32 - // Open the file for reading. - pm_string_file_handle_t handle; - pm_string_init_result_t result = pm_string_file_handle_open(&handle, filepath); - if (result != PM_STRING_INIT_SUCCESS) return result; - - // Get the file size. - DWORD file_size = GetFileSize(handle.file, NULL); - if (file_size == INVALID_FILE_SIZE) { - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // If the file is empty, then we don't need to do anything else, we'll set - // the source to a constant empty string and return. - if (file_size == 0) { - pm_string_file_handle_close(&handle); - const uint8_t source[] = ""; - *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; - return PM_STRING_INIT_SUCCESS; - } - - // Create a buffer to read the file into. - uint8_t *source = xmalloc(file_size); - if (source == NULL) { - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Read the contents of the file - DWORD bytes_read; - if (!ReadFile(handle.file, source, file_size, &bytes_read, NULL)) { - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Check the number of bytes read - if (bytes_read != file_size) { - xfree(source); - pm_string_file_handle_close(&handle); - return PM_STRING_INIT_ERROR_GENERIC; - } - - pm_string_file_handle_close(&handle); - *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = (size_t) file_size }; - return PM_STRING_INIT_SUCCESS; -#elif defined(PRISM_HAS_FILESYSTEM) - // Open the file for reading - int fd = open(filepath, O_RDONLY); - if (fd == -1) { - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Stat the file to get the file size - struct stat sb; - if (fstat(fd, &sb) == -1) { - close(fd); - return PM_STRING_INIT_ERROR_GENERIC; - } - - // Ensure it is a file and not a directory - if (S_ISDIR(sb.st_mode)) { - close(fd); - return PM_STRING_INIT_ERROR_DIRECTORY; - } - - // Check the size to see if it's empty - size_t size = (size_t) sb.st_size; - if (size == 0) { - close(fd); - const uint8_t source[] = ""; - *string = (pm_string_t) { .type = PM_STRING_CONSTANT, .source = source, .length = 0 }; - return PM_STRING_INIT_SUCCESS; - } - - size_t length = (size_t) size; - uint8_t *source = xmalloc(length); - if (source == NULL) { - close(fd); - return PM_STRING_INIT_ERROR_GENERIC; - } - - long bytes_read = (long) read(fd, source, length); - close(fd); - - if (bytes_read == -1) { - xfree(source); - return PM_STRING_INIT_ERROR_GENERIC; - } - - *string = (pm_string_t) { .type = PM_STRING_OWNED, .source = source, .length = length }; - return PM_STRING_INIT_SUCCESS; -#else - (void) string; - (void) filepath; - perror("pm_string_file_init is not implemented for this platform"); - return PM_STRING_INIT_ERROR_GENERIC; -#endif -} - -/** - * Ensure the string is owned. If it is not, then reinitialize it as owned and - * copy over the previous source. - */ -void -pm_string_ensure_owned(pm_string_t *string) { - if (string->type == PM_STRING_OWNED) return; - - size_t length = pm_string_length(string); - const uint8_t *source = pm_string_source(string); - - uint8_t *memory = xmalloc(length); - if (!memory) return; - - pm_string_owned_init(string, memory, length); - memcpy((void *) string->source, source, length); -} - -/** - * Compare the underlying lengths and bytes of two strings. Returns 0 if the - * strings are equal, a negative number if the left string is less than the - * right string, and a positive number if the left string is greater than the - * right string. - */ -int -pm_string_compare(const pm_string_t *left, const pm_string_t *right) { - size_t left_length = pm_string_length(left); - size_t right_length = pm_string_length(right); - - if (left_length < right_length) { - return -1; - } else if (left_length > right_length) { - return 1; - } - - return memcmp(pm_string_source(left), pm_string_source(right), left_length); -} - -/** - * Returns the length associated with the string. - */ -PRISM_EXPORTED_FUNCTION size_t -pm_string_length(const pm_string_t *string) { - return string->length; -} - -/** - * Returns the start pointer associated with the string. - */ -PRISM_EXPORTED_FUNCTION const uint8_t * -pm_string_source(const pm_string_t *string) { - return string->source; -} - -/** - * Free the associated memory of the given string. - */ -PRISM_EXPORTED_FUNCTION void -pm_string_free(pm_string_t *string) { - void *memory = (void *) string->source; - - if (string->type == PM_STRING_OWNED) { - xfree(memory); -#ifdef PRISM_HAS_MMAP - } else if (string->type == PM_STRING_MAPPED && string->length) { -#if defined(_WIN32) - UnmapViewOfFile(memory); -#elif defined(_POSIX_MAPPED_FILES) - munmap(memory, string->length); -#endif -#endif /* PRISM_HAS_MMAP */ - } -} diff --git a/prism/util/pm_string.h b/prism/util/pm_string.h deleted file mode 100644 index f99f1abdf3..0000000000 --- a/prism/util/pm_string.h +++ /dev/null @@ -1,190 +0,0 @@ -/** - * @file pm_string.h - * - * A generic string type that can have various ownership semantics. - */ -#ifndef PRISM_STRING_H -#define PRISM_STRING_H - -#include "prism/defines.h" - -#include <assert.h> -#include <errno.h> -#include <stdbool.h> -#include <stddef.h> -#include <stdlib.h> -#include <string.h> - -// The following headers are necessary to read files using demand paging. -#ifdef _WIN32 -#include <windows.h> -#elif defined(_POSIX_MAPPED_FILES) -#include <fcntl.h> -#include <sys/mman.h> -#include <sys/stat.h> -#elif defined(PRISM_HAS_FILESYSTEM) -#include <fcntl.h> -#include <sys/stat.h> -#endif - -/** - * A generic string type that can have various ownership semantics. - */ -typedef struct { - /** A pointer to the start of the string. */ - const uint8_t *source; - - /** The length of the string in bytes of memory. */ - size_t length; - - /** The type of the string. This field determines how the string should be freed. */ - enum { - /** This string is a constant string, and should not be freed. */ - PM_STRING_CONSTANT, - - /** This is a slice of another string, and should not be freed. */ - PM_STRING_SHARED, - - /** This string owns its memory, and should be freed using `pm_string_free`. */ - PM_STRING_OWNED, - -#ifdef PRISM_HAS_MMAP - /** This string is a memory-mapped file, and should be freed using `pm_string_free`. */ - PM_STRING_MAPPED -#endif - } type; -} pm_string_t; - -/** - * Returns the size of the pm_string_t struct. This is necessary to allocate the - * correct amount of memory in the FFI backend. - * - * @return The size of the pm_string_t struct. - */ -PRISM_EXPORTED_FUNCTION size_t pm_string_sizeof(void); - -/** - * Defines an empty string. This is useful for initializing a string that will - * be filled in later. - */ -#define PM_STRING_EMPTY ((pm_string_t) { .type = PM_STRING_CONSTANT, .source = NULL, .length = 0 }) - -/** - * Initialize a shared string that is based on initial input. - * - * @param string The string to initialize. - * @param start The start of the string. - * @param end The end of the string. - */ -void pm_string_shared_init(pm_string_t *string, const uint8_t *start, const uint8_t *end); - -/** - * Initialize an owned string that is responsible for freeing allocated memory. - * - * @param string The string to initialize. - * @param source The source of the string. - * @param length The length of the string. - */ -void pm_string_owned_init(pm_string_t *string, uint8_t *source, size_t length); - -/** - * Initialize a constant string that doesn't own its memory source. - * - * @param string The string to initialize. - * @param source The source of the string. - * @param length The length of the string. - */ -void pm_string_constant_init(pm_string_t *string, const char *source, size_t length); - -/** - * Represents the result of calling pm_string_mapped_init or - * pm_string_file_init. We need this additional information because there is - * not a platform-agnostic way to indicate that the file that was attempted to - * be opened was a directory. - */ -typedef enum { - /** Indicates that the string was successfully initialized. */ - PM_STRING_INIT_SUCCESS = 0, - /** - * Indicates a generic error from a string_*_init function, where the type - * of error should be read from `errno` or `GetLastError()`. - */ - PM_STRING_INIT_ERROR_GENERIC = 1, - /** - * Indicates that the file that was attempted to be opened was a directory. - */ - PM_STRING_INIT_ERROR_DIRECTORY = 2 -} pm_string_init_result_t; - -/** - * Read the file indicated by the filepath parameter into source and load its - * contents and size into the given `pm_string_t`. The given `pm_string_t` - * should be freed using `pm_string_free` when it is no longer used. - * - * We want to use demand paging as much as possible in order to avoid having to - * read the entire file into memory (which could be detrimental to performance - * for large files). This means that if we're on windows we'll use - * `MapViewOfFile`, on POSIX systems that have access to `mmap` we'll use - * `mmap`, and on other POSIX systems we'll use `read`. - * - * @param string The string to initialize. - * @param filepath The filepath to read. - * @return The success of the read, indicated by the value of the enum. - */ -PRISM_EXPORTED_FUNCTION pm_string_init_result_t pm_string_mapped_init(pm_string_t *string, const char *filepath); - -/** - * Read the file indicated by the filepath parameter into source and load its - * contents and size into the given `pm_string_t`. The given `pm_string_t` - * should be freed using `pm_string_free` when it is no longer used. - * - * @param string The string to initialize. - * @param filepath The filepath to read. - * @return The success of the read, indicated by the value of the enum. - */ -PRISM_EXPORTED_FUNCTION pm_string_init_result_t pm_string_file_init(pm_string_t *string, const char *filepath); - -/** - * Ensure the string is owned. If it is not, then reinitialize it as owned and - * copy over the previous source. - * - * @param string The string to ensure is owned. - */ -void pm_string_ensure_owned(pm_string_t *string); - -/** - * Compare the underlying lengths and bytes of two strings. Returns 0 if the - * strings are equal, a negative number if the left string is less than the - * right string, and a positive number if the left string is greater than the - * right string. - * - * @param left The left string to compare. - * @param right The right string to compare. - * @return The comparison result. - */ -int pm_string_compare(const pm_string_t *left, const pm_string_t *right); - -/** - * Returns the length associated with the string. - * - * @param string The string to get the length of. - * @return The length of the string. - */ -PRISM_EXPORTED_FUNCTION size_t pm_string_length(const pm_string_t *string); - -/** - * Returns the start pointer associated with the string. - * - * @param string The string to get the start pointer of. - * @return The start pointer of the string. - */ -PRISM_EXPORTED_FUNCTION const uint8_t * pm_string_source(const pm_string_t *string); - -/** - * Free the associated memory of the given string. - * - * @param string The string to free. - */ -PRISM_EXPORTED_FUNCTION void pm_string_free(pm_string_t *string); - -#endif diff --git a/prism/util/pm_strpbrk.c b/prism/util/pm_strpbrk.c deleted file mode 100644 index 916a4cc3fd..0000000000 --- a/prism/util/pm_strpbrk.c +++ /dev/null @@ -1,206 +0,0 @@ -#include "prism/util/pm_strpbrk.h" - -/** - * Add an invalid multibyte character error to the parser. - */ -static inline void -pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { - pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start); -} - -/** - * Set the explicit encoding for the parser to the current encoding. - */ -static inline void -pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) { - if (parser->explicit_encoding != NULL) { - if (parser->explicit_encoding == parser->encoding) { - // Okay, we already locked to this encoding. - } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - // Not okay, we already found a Unicode escape sequence and this - // conflicts. - pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name); - } else { - // Should not be anything else. - assert(false && "unreachable"); - } - } - - parser->explicit_encoding = parser->encoding; -} - -/** - * This is the default path. - */ -static inline const uint8_t * -pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { - size_t index = 0; - - while (index < maximum) { - if (strchr((const char *) charset, source[index]) != NULL) { - return source + index; - } - - if (source[index] < 0x80) { - index++; - } else { - size_t width = pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)); - - if (width > 0) { - index += width; - } else if (!validate) { - index++; - } else { - // At this point we know we have an invalid multibyte character. - // We'll walk forward as far as we can until we find the next - // valid character so that we don't spam the user with a ton of - // the same kind of error. - const size_t start = index; - - do { - index++; - } while (index < maximum && pm_encoding_utf_8_char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); - - pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); - } - } - } - - return NULL; -} - -/** - * This is the path when the encoding is ASCII-8BIT. - */ -static inline const uint8_t * -pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { - size_t index = 0; - - while (index < maximum) { - if (strchr((const char *) charset, source[index]) != NULL) { - return source + index; - } - - if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1); - index++; - } - - return NULL; -} - -/** - * This is the slow path that does care about the encoding. - */ -static inline const uint8_t * -pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { - size_t index = 0; - const pm_encoding_t *encoding = parser->encoding; - - while (index < maximum) { - if (strchr((const char *) charset, source[index]) != NULL) { - return source + index; - } - - if (source[index] < 0x80) { - index++; - } else { - size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); - if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width); - - if (width > 0) { - index += width; - } else if (!validate) { - index++; - } else { - // At this point we know we have an invalid multibyte character. - // We'll walk forward as far as we can until we find the next - // valid character so that we don't spam the user with a ton of - // the same kind of error. - const size_t start = index; - - do { - index++; - } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); - - pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); - } - } - } - - return NULL; -} - -/** - * This is the fast path that does not care about the encoding because we know - * the encoding only supports single-byte characters. - */ -static inline const uint8_t * -pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) { - size_t index = 0; - const pm_encoding_t *encoding = parser->encoding; - - while (index < maximum) { - if (strchr((const char *) charset, source[index]) != NULL) { - return source + index; - } - - if (source[index] < 0x80 || !validate) { - index++; - } else { - size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index)); - pm_strpbrk_explicit_encoding_set(parser, source, width); - - if (width > 0) { - index += width; - } else { - // At this point we know we have an invalid multibyte character. - // We'll walk forward as far as we can until we find the next - // valid character so that we don't spam the user with a ton of - // the same kind of error. - const size_t start = index; - - do { - index++; - } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0); - - pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index); - } - } - } - - return NULL; -} - -/** - * Here we have rolled our own version of strpbrk. The standard library strpbrk - * has undefined behavior when the source string is not null-terminated. We want - * to support strings that are not null-terminated because pm_parse does not - * have the contract that the string is null-terminated. (This is desirable - * because it means the extension can call pm_parse with the result of a call to - * mmap). - * - * The standard library strpbrk also does not support passing a maximum length - * to search. We want to support this for the reason mentioned above, but we - * also don't want it to stop on null bytes. Ruby actually allows null bytes - * within strings, comments, regular expressions, etc. So we need to be able to - * skip past them. - * - * Finally, we want to support encodings wherein the charset could contain - * characters that are trailing bytes of multi-byte characters. For example, in - * Shift_JIS, the backslash character can be a trailing byte. In that case we - * need to take a slower path and iterate one multi-byte character at a time. - */ -const uint8_t * -pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, ptrdiff_t length, bool validate) { - if (length <= 0) { - return NULL; - } else if (!parser->encoding_changed) { - return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate); - } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { - return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate); - } else if (parser->encoding->multibyte) { - return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate); - } else { - return pm_strpbrk_single_byte(parser, source, charset, (size_t) length, validate); - } -} diff --git a/prism/version.h b/prism/version.h index 0a2a8c8fce..181b398462 100644 --- a/prism/version.h +++ b/prism/version.h @@ -6,6 +6,8 @@ #ifndef PRISM_VERSION_H #define PRISM_VERSION_H +#include "prism/compiler/exported.h" + /** * The major version of the Prism library as an int. */ @@ -14,7 +16,7 @@ /** * The minor version of the Prism library as an int. */ -#define PRISM_VERSION_MINOR 4 +#define PRISM_VERSION_MINOR 9 /** * The patch version of the Prism library as an int. @@ -24,6 +26,13 @@ /** * The version of the Prism library as a constant string. */ -#define PRISM_VERSION "1.4.0" +#define PRISM_VERSION "1.9.0" + +/** + * The prism version and the serialization format. + * + * @returns The prism version as a constant string. + */ +PRISM_EXPORTED_FUNCTION const char * pm_version(void); #endif |
