diff options
Diffstat (limited to 'prism/parser.h')
| -rw-r--r-- | prism/parser.h | 1155 |
1 files changed, 285 insertions, 870 deletions
diff --git a/prism/parser.h b/prism/parser.h index 992729d655..2c8c4b3a7a 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -6,928 +6,343 @@ #ifndef PRISM_PARSER_H #define PRISM_PARSER_H -#include "prism/defines.h" +#include "prism/compiler/nodiscard.h" +#include "prism/compiler/nonnull.h" + #include "prism/ast.h" -#include "prism/encoding.h" +#include "prism/comments.h" +#include "prism/diagnostic.h" +#include "prism/line_offset_list.h" +#include "prism/magic_comments.h" #include "prism/options.h" -#include "prism/static_literals.h" -#include "prism/util/pm_constant_pool.h" -#include "prism/util/pm_list.h" -#include "prism/util/pm_newline_list.h" -#include "prism/util/pm_string.h" - -#include <stdbool.h> - -/** - * This enum provides various bits that represent different kinds of states that - * the lexer can track. This is used to determine which kind of token to return - * based on the context of the parser. - */ -typedef enum { - PM_LEX_STATE_BIT_BEG, - PM_LEX_STATE_BIT_END, - PM_LEX_STATE_BIT_ENDARG, - PM_LEX_STATE_BIT_ENDFN, - PM_LEX_STATE_BIT_ARG, - PM_LEX_STATE_BIT_CMDARG, - PM_LEX_STATE_BIT_MID, - PM_LEX_STATE_BIT_FNAME, - PM_LEX_STATE_BIT_DOT, - PM_LEX_STATE_BIT_CLASS, - PM_LEX_STATE_BIT_LABEL, - PM_LEX_STATE_BIT_LABELED, - PM_LEX_STATE_BIT_FITEM -} pm_lex_state_bit_t; - -/** - * This enum combines the various bits from the above enum into individual - * values that represent the various states of the lexer. - */ -typedef enum { - PM_LEX_STATE_NONE = 0, - PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), - PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), - PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), - PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), - PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), - PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), - PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), - PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), - PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), - PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), - PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), - PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), - PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), - PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, - PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, - PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN -} pm_lex_state_t; - -/** - * The type of quote that a heredoc uses. - */ -typedef enum { - PM_HEREDOC_QUOTE_NONE, - PM_HEREDOC_QUOTE_SINGLE = '\'', - PM_HEREDOC_QUOTE_DOUBLE = '"', - PM_HEREDOC_QUOTE_BACKTICK = '`', -} pm_heredoc_quote_t; - -/** - * The type of indentation that a heredoc uses. - */ -typedef enum { - PM_HEREDOC_INDENT_NONE, - PM_HEREDOC_INDENT_DASH, - PM_HEREDOC_INDENT_TILDE, -} pm_heredoc_indent_t; - -/** - * All of the information necessary to store to lexing a heredoc. - */ -typedef struct { - /** A pointer to the start of the heredoc identifier. */ - const uint8_t *ident_start; - - /** The length of the heredoc identifier. */ - size_t ident_length; - - /** The type of quote that the heredoc uses. */ - pm_heredoc_quote_t quote; - - /** The type of indentation that the heredoc uses. */ - pm_heredoc_indent_t indent; -} pm_heredoc_lex_mode_t; - -/** - * When lexing Ruby source, the lexer has a small amount of state to tell which - * kind of token it is currently lexing. For example, when we find the start of - * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After - * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that - * are found as part of a string. - */ -typedef struct pm_lex_mode { - /** The type of this lex mode. */ - enum { - /** This state is used when any given token is being lexed. */ - PM_LEX_DEFAULT, - - /** - * This state is used when we're lexing as normal but inside an embedded - * expression of a string. - */ - PM_LEX_EMBEXPR, - - /** - * This state is used when we're lexing a variable that is embedded - * directly inside of a string with the # shorthand. - */ - PM_LEX_EMBVAR, - - /** This state is used when you are inside the content of a heredoc. */ - PM_LEX_HEREDOC, - - /** - * This state is used when we are lexing a list of tokens, as in a %w - * word list literal or a %i symbol list literal. - */ - PM_LEX_LIST, - - /** - * This state is used when a regular expression has been begun and we - * are looking for the terminator. - */ - PM_LEX_REGEXP, - - /** - * This state is used when we are lexing a string or a string-like - * token, as in string content with either quote or an xstring. - */ - PM_LEX_STRING - } mode; - - /** The data associated with this type of lex mode. */ - union { - struct { - /** This keeps track of the nesting level of the list. */ - size_t nesting; - - /** Whether or not interpolation is allowed in this list. */ - bool interpolation; - - /** - * When lexing a list, it takes into account balancing the - * terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** This is the terminator of the list literal. */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the list. - */ - uint8_t breakpoints[11]; - } list; - - struct { - /** - * This keeps track of the nesting level of the regular expression. - */ - size_t nesting; - - /** - * When lexing a regular expression, it takes into account balancing - * the terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** This is the terminator of the regular expression. */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the regular expression. - */ - uint8_t breakpoints[7]; - } regexp; - - struct { - /** This keeps track of the nesting level of the string. */ - size_t nesting; - - /** Whether or not interpolation is allowed in this string. */ - bool interpolation; - - /** - * Whether or not at the end of the string we should allow a :, - * which would indicate this was a dynamic symbol instead of a - * string. - */ - bool label_allowed; - - /** - * When lexing a string, it takes into account balancing the - * terminator if the terminator is one of (), [], {}, or <>. - */ - uint8_t incrementor; - - /** - * This is the terminator of the string. It is typically either a - * single or double quote. - */ - uint8_t terminator; - - /** - * This is the character set that should be used to delimit the - * tokens within the string. - */ - uint8_t breakpoints[7]; - } string; - - struct { - /** - * All of the data necessary to lex a heredoc. - */ - pm_heredoc_lex_mode_t base; - - /** - * This is the pointer to the character where lexing should resume - * once the heredoc has been completely processed. - */ - const uint8_t *next_start; - - /** - * This is used to track the amount of common whitespace on each - * line so that we know how much to dedent each line in the case of - * a tilde heredoc. - */ - size_t *common_whitespace; - - /** True if the previous token ended with a line continuation. */ - bool line_continuation; - } heredoc; - } as; - - /** The previous lex state so that it knows how to pop. */ - struct pm_lex_mode *prev; -} pm_lex_mode_t; - -/** - * We pre-allocate a certain number of lex states in order to avoid having to - * call malloc too many times while parsing. You really shouldn't need more than - * this because you only really nest deeply when doing string interpolation. - */ -#define PM_LEX_STACK_SIZE 4 /** * The parser used to parse Ruby source. */ -typedef struct pm_parser pm_parser_t; +typedef struct pm_parser_t pm_parser_t; /** - * While parsing, we keep track of a stack of contexts. This is helpful for - * error recovery so that we can pop back to a previous context when we hit a - * token that is understood by a parent context but not by the current context. + * Allocate and initialize a parser with the given start and end pointers. + * + * @param arena The arena to use for all AST-lifetime allocations. It is caller- + * owned and must outlive the parser. + * @param source The source to parse. + * @param size The size of the source. + * @param options The optional options to use when parsing. These options must + * live for the whole lifetime of this parser. + * @returns The initialized parser. It is the responsibility of the caller to + * free the parser with `pm_parser_free()`. */ -typedef enum { - /** a null context, used for returning a value from a function */ - PM_CONTEXT_NONE = 0, - - /** a begin statement */ - PM_CONTEXT_BEGIN, - - /** an ensure statement with an explicit begin */ - PM_CONTEXT_BEGIN_ENSURE, - - /** a rescue else statement with an explicit begin */ - PM_CONTEXT_BEGIN_ELSE, - - /** a rescue statement with an explicit begin */ - PM_CONTEXT_BEGIN_RESCUE, - - /** expressions in block arguments using braces */ - PM_CONTEXT_BLOCK_BRACES, - - /** expressions in block arguments using do..end */ - PM_CONTEXT_BLOCK_KEYWORDS, - - /** an ensure statement within a do..end block */ - PM_CONTEXT_BLOCK_ENSURE, - - /** a rescue else statement within a do..end block */ - PM_CONTEXT_BLOCK_ELSE, - - /** a rescue statement within a do..end block */ - PM_CONTEXT_BLOCK_RESCUE, - - /** a case when statements */ - PM_CONTEXT_CASE_WHEN, - - /** a case in statements */ - PM_CONTEXT_CASE_IN, - - /** a class declaration */ - PM_CONTEXT_CLASS, - - /** an ensure statement within a class statement */ - PM_CONTEXT_CLASS_ENSURE, - - /** a rescue else statement within a class statement */ - PM_CONTEXT_CLASS_ELSE, - - /** a rescue statement within a class statement */ - PM_CONTEXT_CLASS_RESCUE, - - /** a method definition */ - PM_CONTEXT_DEF, - - /** an ensure statement within a method definition */ - PM_CONTEXT_DEF_ENSURE, - - /** a rescue else statement within a method definition */ - PM_CONTEXT_DEF_ELSE, - - /** a rescue statement within a method definition */ - PM_CONTEXT_DEF_RESCUE, - - /** a method definition's parameters */ - PM_CONTEXT_DEF_PARAMS, - - /** a defined? expression */ - PM_CONTEXT_DEFINED, - - /** a method definition's default parameter */ - PM_CONTEXT_DEFAULT_PARAMS, - - /** an else clause */ - PM_CONTEXT_ELSE, - - /** an elsif clause */ - PM_CONTEXT_ELSIF, - - /** an interpolated expression */ - PM_CONTEXT_EMBEXPR, - - /** a for loop */ - PM_CONTEXT_FOR, - - /** a for loop's index */ - PM_CONTEXT_FOR_INDEX, - - /** an if statement */ - PM_CONTEXT_IF, - - /** a lambda expression with braces */ - PM_CONTEXT_LAMBDA_BRACES, - - /** a lambda expression with do..end */ - PM_CONTEXT_LAMBDA_DO_END, - - /** an ensure statement within a lambda expression */ - PM_CONTEXT_LAMBDA_ENSURE, +PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_parser_t * pm_parser_new(pm_arena_t *arena, const uint8_t *source, size_t size, const pm_options_t *options) PRISM_NONNULL(1); - /** a rescue else statement within a lambda expression */ - PM_CONTEXT_LAMBDA_ELSE, - - /** a rescue statement within a lambda expression */ - PM_CONTEXT_LAMBDA_RESCUE, +/** + * Free both the memory held by the given parser and the parser itself. + * + * @param parser The parser to free. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser) PRISM_NONNULL(1); - /** the predicate clause of a loop statement */ - PM_CONTEXT_LOOP_PREDICATE, +/** + * When the encoding that is being used to parse the source is changed by prism, + * we provide the ability here to call out to a user-defined function. + */ +typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser); - /** the top level context */ - PM_CONTEXT_MAIN, +/** + * This is the callback that is called when a token is lexed. It is passed + * the opaque data pointer, the parser, and the token that was lexed. + */ +typedef void (*pm_lex_callback_t)(pm_parser_t *parser, pm_token_t *token, void *data); - /** a module declaration */ - PM_CONTEXT_MODULE, +/** + * Register a callback that will be called whenever prism changes the encoding + * it is using to parse based on the magic comment. + * + * @param parser The parser to register the callback with. + * @param callback The callback to register. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_encoding_changed_callback_set(pm_parser_t *parser, pm_encoding_changed_callback_t callback) PRISM_NONNULL(1); - /** an ensure statement within a module statement */ - PM_CONTEXT_MODULE_ENSURE, +/** + * Register a callback that will be called whenever a token is lexed. + * + * @param parser The parser to register the callback with. + * @param data The opaque data to pass to the callback when it is called. + * @param callback The callback to register. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_lex_callback_set(pm_parser_t *parser, pm_lex_callback_t callback, void *data) PRISM_NONNULL(1); - /** a rescue else statement within a module statement */ - PM_CONTEXT_MODULE_ELSE, +/** + * Returns the opaque data that is passed to the lex callback when it is called. + * + * @param parser The parser whose lex callback data we want to get. + * @returns The opaque data that is passed to the lex callback when it is called. + */ +PRISM_EXPORTED_FUNCTION void * pm_parser_lex_callback_data(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue statement within a module statement */ - PM_CONTEXT_MODULE_RESCUE, +/** + * Returns the raw pointer to the start of the source that is being parsed. + * + * @param parser the parser whose start pointer we want to get + * @returns the raw pointer to the start of the source that is being parsed + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_start(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a multiple target expression */ - PM_CONTEXT_MULTI_TARGET, +/** + * Returns the raw pointer to the end of the source that is being parsed. + * + * @param parser the parser whose end pointer we want to get + * @returns the raw pointer to the end of the source that is being parsed + */ +PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_end(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a parenthesized expression */ - PM_CONTEXT_PARENS, +/** + * Returns the line that the parser was considered to have started on. + * + * @param parser the parser whose start line we want to get + * @returns the line that the parser was considered to have started on + */ +PRISM_EXPORTED_FUNCTION int32_t pm_parser_start_line(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an END block */ - PM_CONTEXT_POSTEXE, +/** + * Returns the name of the encoding that is being used to parse the source. + * + * @param parser the parser whose encoding name we want to get + * @returns the name of the encoding that is being used to parse the source + */ +PRISM_EXPORTED_FUNCTION const char * pm_parser_encoding_name(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a predicate inside an if/elsif/unless statement */ - PM_CONTEXT_PREDICATE, +/** + * Returns the width of the character at the given pointer in the encoding that + * is being used to parse the source. + * + * @param parser the parser whose encoding we want to use + * @param start a pointer to the start of the character + * @param remaining the number of bytes remaining in the source + * @returns the width of the character in bytes + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_encoding_char_width(const pm_parser_t *parser, const uint8_t *start, ptrdiff_t remaining) PRISM_NONNULL(1, 2); - /** a BEGIN block */ - PM_CONTEXT_PREEXE, +/** + * Returns whether or not the parser is using the US-ASCII encoding. + * + * @param parser the parser to check + * @returns true if the parser is using US-ASCII encoding, false otherwise + */ +PRISM_EXPORTED_FUNCTION bool pm_parser_encoding_us_ascii(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a modifier rescue clause */ - PM_CONTEXT_RESCUE_MODIFIER, +/** + * Returns the filepath that is being used to parse the source. + * + * @param parser the parser whose filepath we want to get + * @returns a pointer to the filepath string + */ +PRISM_EXPORTED_FUNCTION const pm_string_t * pm_parser_filepath(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a singleton class definition */ - PM_CONTEXT_SCLASS, +/** + * Find a constant in the parser's constant pool. Returns the id of the + * constant, or 0 if the constant is not found. + * + * @param parser the parser whose constant pool we want to search + * @param start a pointer to the start of the string to search for + * @param length the length of the string to search for + * @returns the id of the constant, or 0 if the constant is not found + */ +PRISM_EXPORTED_FUNCTION pm_constant_id_t pm_parser_constant_find(const pm_parser_t *parser, const uint8_t *start, size_t length) PRISM_NONNULL(1, 2); - /** an ensure statement with a singleton class */ - PM_CONTEXT_SCLASS_ENSURE, +/** + * Returns the frozen string literal value of the parser, as determined by the + * frozen_string_literal magic comment or the option set on the parser. + * + * @param parser the parser whose frozen string literal value we want to get + * @returns -1 if disabled, 0 if unset, 1 if enabled + */ +PRISM_EXPORTED_FUNCTION int8_t pm_parser_frozen_string_literal(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue else statement with a singleton class */ - PM_CONTEXT_SCLASS_ELSE, +/** + * Returns the line offsets that are associated with the given parser. + * + * @param parser the parser whose line offsets we want to get + * @returns the line offsets that are associated with the given parser + */ +PRISM_EXPORTED_FUNCTION const pm_line_offset_list_t * pm_parser_line_offsets(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a rescue statement with a singleton class */ - PM_CONTEXT_SCLASS_RESCUE, +/** + * Returns the location of the __DATA__ section that is associated with the + * given parser. + * + * @param parser the parser whose data location we want to get + * @returns the location of the __DATA__ section that is associated with the + * given parser. If it is unset, then the length will be set to 0. + */ +PRISM_EXPORTED_FUNCTION const pm_location_t * pm_parser_data_loc(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a ternary expression */ - PM_CONTEXT_TERNARY, +/** + * Returns whether the given parser is continuable, meaning that it could become + * valid if more input were appended, as opposed to being definitively invalid. + * + * @param parser the parser whose continuable status we want to get + * @returns whether the given parser is continuable + */ +PRISM_EXPORTED_FUNCTION bool pm_parser_continuable(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an unless statement */ - PM_CONTEXT_UNLESS, +/** + * Returns the lex state of the parser. Note that this is an internal detail, + * and we are purposefully not returning an instance of the internal enum that + * we use to track this. This is only exposed because we need it for some very + * niche use cases. Most consumers should avoid this function. + * + * @param parser the parser whose lex state we want to get + * @returns the lex state of the parser + */ +PRISM_EXPORTED_FUNCTION int pm_parser_lex_state(const pm_parser_t *parser) PRISM_NONNULL(1); - /** an until statement */ - PM_CONTEXT_UNTIL, +/** + * Returns the number of comments associated with the given parser. + * + * @param parser the parser whose comments we want to get the size of + * @returns the number of comments associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** a while statement */ - PM_CONTEXT_WHILE, -} pm_context_t; +/** + * A callback function that can be used to process comments found while parsing. + */ +typedef void (*pm_comment_callback_t)(const pm_comment_t *comment, void *data); -/** This is a node in a linked list of contexts. */ -typedef struct pm_context_node { - /** The context that this node represents. */ - pm_context_t context; +/** + * Iterates over the comments associated with the given parser and calls the + * given callback for each comment. + * + * @param parser the parser whose comments we want to iterate over + * @param callback the callback function to call for each comment. This function + * will be passed a pointer to the comment and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each comment. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_comments_each(const pm_parser_t *parser, pm_comment_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the previous context in the linked list. */ - struct pm_context_node *prev; -} pm_context_node_t; +/** + * Returns the number of magic comments associated with the given parser. + * + * @param parser the parser whose magic comments we want to get the size of + * @returns the number of magic comments associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_magic_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1); -/** This is the type of a comment that we've found while parsing. */ -typedef enum { - PM_COMMENT_INLINE, - PM_COMMENT_EMBDOC -} pm_comment_type_t; +/** + * A callback function that can be used to process magic comments found while parsing. + */ +typedef void (*pm_magic_comment_callback_t)(const pm_magic_comment_t *magic_comment, void *data); /** - * This is a node in the linked list of comments that we've found while parsing. + * Iterates over the magic comments associated with the given parser and calls the + * given callback for each magic comment. * - * @extends pm_list_node_t + * @param parser the parser whose magic comments we want to iterate over + * @param callback the callback function to call for each magic comment. This + * function will be passed a pointer to the magic comment and the data + * parameter passed to this function. + * @param data the data to pass to the callback function for each magic comment. + * This can be NULL if no data needs to be passed to the callback function. */ -typedef struct pm_comment { - /** The embedded base node. */ - pm_list_node_t node; +PRISM_EXPORTED_FUNCTION void pm_parser_magic_comments_each(const pm_parser_t *parser, pm_magic_comment_callback_t callback, void *data) PRISM_NONNULL(1); - /** The location of the comment in the source. */ - pm_location_t location; +/** + * Returns the number of errors associated with the given parser. + * + * @param parser the parser whose errors we want to get the size of + * @returns the number of errors associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_errors_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** The type of comment that we've found. */ - pm_comment_type_t type; -} pm_comment_t; +/** + * Returns the number of warnings associated with the given parser. + * + * @param parser the parser whose warnings we want to get the size of + * @returns the number of warnings associated with the given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_warnings_size(const pm_parser_t *parser) PRISM_NONNULL(1); /** - * This is a node in the linked list of magic comments that we've found while + * A callback function that can be used to process diagnostics found while * parsing. + */ +typedef void (*pm_diagnostic_callback_t)(const pm_diagnostic_t *diagnostic, void *data); + +/** + * Iterates over the errors associated with the given parser and calls the + * given callback for each error. * - * @extends pm_list_node_t + * @param parser the parser whose errors we want to iterate over + * @param callback the callback function to call for each error. This function + * will be passed a pointer to the error and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each error. This + * can be NULL if no data needs to be passed to the callback function. */ -typedef struct { - /** The embedded base node. */ - pm_list_node_t node; +PRISM_EXPORTED_FUNCTION void pm_parser_errors_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the start of the key in the source. */ - const uint8_t *key_start; +/** + * Iterates over the warnings associated with the given parser and calls the + * given callback for each warning. + * + * @param parser the parser whose warnings we want to iterate over + * @param callback the callback function to call for each warning. This function + * will be passed a pointer to the warning and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each warning. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_warnings_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1); - /** A pointer to the start of the value in the source. */ - const uint8_t *value_start; +/** + * Returns the number of constants in the constant pool associated with the + * given parser. + * + * @param parser the parser whose constant pool constants we want to get the + * size of + * @returns the number of constants in the constant pool associated with the + * given parser + */ +PRISM_EXPORTED_FUNCTION size_t pm_parser_constants_size(const pm_parser_t *parser) PRISM_NONNULL(1); - /** The length of the key in the source. */ - uint32_t key_length; +/** + * A callback function that can be used to process constants found while + * parsing. + */ +typedef void (*pm_constant_callback_t)(const pm_constant_t *constant, void *data); - /** The length of the value in the source. */ - uint32_t value_length; -} pm_magic_comment_t; +/** + * Iterates over the constants in the constant pool associated with the given + * parser and calls the given callback for each constant. + * + * @param parser the parser whose constants we want to iterate over + * @param callback the callback function to call for each constant. This function + * will be passed a pointer to the constant and the data parameter passed to + * this function. + * @param data the data to pass to the callback function for each constant. This + * can be NULL if no data needs to be passed to the callback function. + */ +PRISM_EXPORTED_FUNCTION void pm_parser_constants_each(const pm_parser_t *parser, pm_constant_callback_t callback, void *data) PRISM_NONNULL(1); /** - * When the encoding that is being used to parse the source is changed by prism, - * we provide the ability here to call out to a user-defined function. + * Returns a pointer to the constant at the given id in the constant pool + * associated with the given parser. + * + * @param parser the parser whose constant pool we want to look up from + * @param constant_id the id of the constant to look up (1-based) + * @returns a pointer to the constant at the given id */ -typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser); +PRISM_EXPORTED_FUNCTION const pm_constant_t * pm_parser_constant(const pm_parser_t *parser, pm_constant_id_t constant_id) PRISM_NONNULL(1); /** - * When you are lexing through a file, the lexer needs all of the information - * that the parser additionally provides (for example, the local table). So if - * you want to properly lex Ruby, you need to actually lex it in the context of - * the parser. In order to provide this functionality, we optionally allow a - * struct to be attached to the parser that calls back out to a user-provided - * callback when each token is lexed. - */ -typedef struct { - /** - * This opaque pointer is used to provide whatever information the user - * deemed necessary to the callback. In our case we use it to pass the array - * that the tokens get appended into. - */ - void *data; - - /** - * This is the callback that is called when a token is lexed. It is passed - * the opaque data pointer, the parser, and the token that was lexed. - */ - void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token); -} pm_lex_callback_t; - -/** The type of shareable constant value that can be set. */ -typedef uint8_t pm_shareable_constant_value_t; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING; -static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY; - -/** - * This tracks an individual local variable in a certain lexical context, as - * well as the number of times is it read. - */ -typedef struct { - /** The name of the local variable. */ - pm_constant_id_t name; - - /** The location of the local variable in the source. */ - pm_location_t location; - - /** The index of the local variable in the local table. */ - uint32_t index; - - /** The number of times the local variable is read. */ - uint32_t reads; - - /** The hash of the local variable. */ - uint32_t hash; -} pm_local_t; - -/** - * This is a set of local variables in a certain lexical context (method, class, - * module, etc.). We need to track how many times these variables are read in - * order to warn if they only get written. - */ -typedef struct pm_locals { - /** The number of local variables in the set. */ - uint32_t size; - - /** The capacity of the local variables set. */ - uint32_t capacity; - - /** The nullable allocated memory for the local variables in the set. */ - pm_local_t *locals; -} pm_locals_t; - -/** The flags about scope parameters that can be set. */ -typedef uint8_t pm_scope_parameters_t; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20; -static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40; - -/** - * This struct represents a node in a linked list of scopes. Some scopes can see - * into their parent scopes, while others cannot. - */ -typedef struct pm_scope { - /** A pointer to the previous scope in the linked list. */ - struct pm_scope *previous; - - /** The IDs of the locals in the given scope. */ - pm_locals_t locals; - - /** - * This is a list of the implicit parameters contained within the block. - * These will be processed after the block is parsed to determine the kind - * of parameters node that should be used and to check if any errors need to - * be added. - */ - pm_node_list_t implicit_parameters; - - /** - * This is a bitfield that indicates the parameters that are being used in - * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants. - * There are three different kinds of parameters that can be used in a - * scope: - * - * - Ordinary parameters (e.g., def foo(bar); end) - * - Numbered parameters (e.g., def foo; _1; end) - * - The it parameter (e.g., def foo; it; end) - * - * If ordinary parameters are being used, then certain parameters can be - * forwarded to another method/structure. Those are indicated by four - * additional bits in the params field. For example, some combinations of: - * - * - def foo(*); end - * - def foo(**); end - * - def foo(&); end - * - def foo(...); end - */ - pm_scope_parameters_t parameters; - - /** - * The current state of constant shareability for this scope. This is - * changed by magic shareable_constant_value comments. - */ - pm_shareable_constant_value_t shareable_constant; - - /** - * A boolean indicating whether or not this scope can see into its parent. - * If closed is true, then the scope cannot see into its parent. - */ - bool closed; -} pm_scope_t; - -/** - * A struct that represents a stack of boolean values. - */ -typedef uint32_t pm_state_stack_t; - -/** - * This struct represents the overall parser. It contains a reference to the - * source file, as well as pointers that indicate where in the source it's - * currently parsing. It also contains the most recent and current token that - * it's considering. - */ -struct pm_parser { - /** - * The next node identifier that will be assigned. This is a unique - * identifier used to track nodes such that the syntax tree can be dropped - * but the node can be found through another parse. - */ - uint32_t node_id; - - /** The current state of the lexer. */ - pm_lex_state_t lex_state; - - /** Tracks the current nesting of (), [], and {}. */ - int enclosure_nesting; - - /** - * Used to temporarily track the nesting of enclosures to determine if a { - * is the beginning of a lambda following the parameters of a lambda. - */ - int lambda_enclosure_nesting; - - /** - * Used to track the nesting of braces to ensure we get the correct value - * when we are interpolating blocks with braces. - */ - int brace_nesting; - - /** - * The stack used to determine if a do keyword belongs to the predicate of a - * while, until, or for loop. - */ - pm_state_stack_t do_loop_stack; - - /** - * The stack used to determine if a do keyword belongs to the beginning of a - * block. - */ - pm_state_stack_t accepts_block_stack; - - /** A stack of lex modes. */ - struct { - /** The current mode of the lexer. */ - pm_lex_mode_t *current; - - /** The stack of lexer modes. */ - pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; - - /** The current index into the lexer mode stack. */ - size_t index; - } lex_modes; - - /** The pointer to the start of the source. */ - const uint8_t *start; - - /** The pointer to the end of the source. */ - const uint8_t *end; - - /** The previous token we were considering. */ - pm_token_t previous; - - /** The current token we're considering. */ - pm_token_t current; - - /** - * This is a special field set on the parser when we need the parser to jump - * to a specific location when lexing the next token, as opposed to just - * using the end of the previous token. Normally this is NULL. - */ - const uint8_t *next_start; - - /** - * This field indicates the end of a heredoc whose identifier was found on - * the current line. If another heredoc is found on the same line, then this - * will be moved forward to the end of that heredoc. If no heredocs are - * found on a line then this is NULL. - */ - const uint8_t *heredoc_end; - - /** The list of comments that have been found while parsing. */ - pm_list_t comment_list; - - /** The list of magic comments that have been found while parsing. */ - pm_list_t magic_comment_list; - - /** - * An optional location that represents the location of the __END__ marker - * and the rest of the content of the file. This content is loaded into the - * DATA constant when the file being parsed is the main file being executed. - */ - pm_location_t data_loc; - - /** The list of warnings that have been found while parsing. */ - pm_list_t warning_list; - - /** The list of errors that have been found while parsing. */ - pm_list_t error_list; - - /** The current local scope. */ - pm_scope_t *current_scope; - - /** The current parsing context. */ - pm_context_node_t *current_context; - - /** - * The hash keys for the hash that is currently being parsed. This is not - * usually necessary because it can pass it down the various call chains, - * but in the event that you're parsing a hash that is being directly - * pushed into another hash with **, we need to share the hash keys so that - * we can warn for the nested hash as well. - */ - pm_static_literals_t *current_hash_keys; - - /** - * The encoding functions for the current file is attached to the parser as - * it's parsing so that it can change with a magic comment. - */ - const pm_encoding_t *encoding; - - /** - * When the encoding that is being used to parse the source is changed by - * prism, we provide the ability here to call out to a user-defined - * function. - */ - pm_encoding_changed_callback_t encoding_changed_callback; - - /** - * This pointer indicates where a comment must start if it is to be - * considered an encoding comment. - */ - const uint8_t *encoding_comment_start; - - /** - * This is an optional callback that can be attached to the parser that will - * be called whenever a new token is lexed by the parser. - */ - pm_lex_callback_t *lex_callback; - - /** - * This is the path of the file being parsed. We use the filepath when - * constructing SourceFileNodes. - */ - pm_string_t filepath; - - /** - * This constant pool keeps all of the constants defined throughout the file - * so that we can reference them later. - */ - pm_constant_pool_t constant_pool; - - /** This is the list of newline offsets in the source file. */ - pm_newline_list_t newline_list; - - /** - * We want to add a flag to integer nodes that indicates their base. We only - * want to parse these once, but we don't have space on the token itself to - * communicate this information. So we store it here and pass it through - * when we find tokens that we need it for. - */ - pm_node_flags_t integer_base; - - /** - * This string is used to pass information from the lexer to the parser. It - * is particularly necessary because of escape sequences. - */ - pm_string_t current_string; - - /** - * The line number at the start of the parse. This will be used to offset - * the line numbers of all of the locations. - */ - int32_t start_line; - - /** - * When a string-like expression is being lexed, any byte or escape sequence - * that resolves to a value whose top bit is set (i.e., >= 0x80) will - * explicitly set the encoding to the same encoding as the source. - * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that - * resolves to a value whose top bit is set, then the encoding will be - * explicitly set to UTF-8. - * - * The _next_ time this happens, if the encoding that is about to become the - * explicitly set encoding does not match the previously set explicit - * encoding, a mixed encoding error will be emitted. - * - * When the expression is finished being lexed, the explicit encoding - * controls the encoding of the expression. For the most part this means - * that the expression will either be encoded in the source encoding or - * UTF-8. This holds for all encodings except US-ASCII. If the source is - * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the - * expression will be encoded as ASCII-8BIT. - * - * Note that if the expression is a list, different elements within the same - * list can have different encodings, so this will get reset between each - * element. Furthermore all of this only applies to lists that support - * interpolation, because otherwise escapes that could change the encoding - * are ignored. - * - * At first glance, it may make more sense for this to live on the lexer - * mode, but we need it here to communicate back to the parser for character - * literals that do not push a new lexer mode. - */ - const pm_encoding_t *explicit_encoding; - - /** - * When parsing block exits (e.g., break, next, redo), we need to validate - * that they are in correct contexts. For the most part we can do this by - * looking at our parent contexts. However, modifier while and until - * expressions can change that context to make block exits valid. In these - * cases, we need to keep track of the block exits and then validate them - * after the expression has been parsed. - * - * We use a pointer here because we don't want to keep a whole list attached - * since this will only be used in the context of begin/end expressions. - */ - pm_node_list_t *current_block_exits; - - /** The version of prism that we should use to parse. */ - pm_options_version_t version; - - /** The command line flags given from the options. */ - uint8_t command_line; - - /** - * Whether or not we have found a frozen_string_literal magic comment with - * a true or false value. - * May be: - * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED - * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET - */ - int8_t frozen_string_literal; - - /** - * Whether or not we are parsing an eval string. This impacts whether or not - * we should evaluate if block exits/yields are valid. - */ - bool parsing_eval; - - /** - * Whether or not we are parsing a "partial" script, which is a script that - * will be evaluated in the context of another script, so we should not - * check jumps (next/break/etc.) for validity. - */ - bool partial_script; - - /** Whether or not we're at the beginning of a command. */ - bool command_start; - - /** Whether or not we're currently recovering from a syntax error. */ - bool recovering; - - /** - * This is very specialized behavior for when you want to parse in a context - * that does not respect encoding comments. Its main use case is translating - * into the whitequark/parser AST which re-encodes source files in UTF-8 - * before they are parsed and ignores encoding comments. - */ - bool encoding_locked; - - /** - * Whether or not the encoding has been changed by a magic comment. We use - * this to provide a fast path for the lexer instead of going through the - * function pointer. - */ - bool encoding_changed; - - /** - * This flag indicates that we are currently parsing a pattern matching - * expression and impacts that calculation of newlines. - */ - bool pattern_matching_newlines; - - /** This flag indicates that we are currently parsing a keyword argument. */ - bool in_keyword_arg; - - /** - * Whether or not the parser has seen a token that has semantic meaning - * (i.e., a token that is not a comment or whitespace). - */ - bool semantic_token_seen; - - /** - * True if the current regular expression being lexed contains only ASCII - * characters. - */ - bool current_regular_expression_ascii_only; - - /** - * By default, Ruby always warns about mismatched indentation. This can be - * toggled with a magic comment. - */ - bool warn_mismatched_indentation; -}; + * Initiate the parser with the given parser. + * + * @param parser The parser to use. + * @returns The AST representing the source. + */ +PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser) PRISM_NONNULL(1); #endif |
