summaryrefslogtreecommitdiff
path: root/prism/parser.h
diff options
context:
space:
mode:
Diffstat (limited to 'prism/parser.h')
-rw-r--r--prism/parser.h940
1 files changed, 286 insertions, 654 deletions
diff --git a/prism/parser.h b/prism/parser.h
index 7f26054f09..2c8c4b3a7a 100644
--- a/prism/parser.h
+++ b/prism/parser.h
@@ -6,711 +6,343 @@
#ifndef PRISM_PARSER_H
#define PRISM_PARSER_H
+#include "prism/compiler/nodiscard.h"
+#include "prism/compiler/nonnull.h"
+
#include "prism/ast.h"
-#include "prism/defines.h"
-#include "prism/encoding.h"
-#include "prism/util/pm_constant_pool.h"
-#include "prism/util/pm_list.h"
-#include "prism/util/pm_newline_list.h"
-#include "prism/util/pm_state_stack.h"
-#include "prism/util/pm_string.h"
-
-#include <stdbool.h>
-
-// TODO: remove this by renaming the original flag
-/**
- * Temporary alias for the PM_NODE_FLAG_STATIC_KEYS flag.
- */
-#define PM_KEYWORD_HASH_NODE_FLAGS_SYMBOL_KEYS PM_KEYWORD_HASH_NODE_FLAGS_STATIC_KEYS
-
-/**
- * This enum provides various bits that represent different kinds of states that
- * the lexer can track. This is used to determine which kind of token to return
- * based on the context of the parser.
- */
-typedef enum {
- PM_LEX_STATE_BIT_BEG,
- PM_LEX_STATE_BIT_END,
- PM_LEX_STATE_BIT_ENDARG,
- PM_LEX_STATE_BIT_ENDFN,
- PM_LEX_STATE_BIT_ARG,
- PM_LEX_STATE_BIT_CMDARG,
- PM_LEX_STATE_BIT_MID,
- PM_LEX_STATE_BIT_FNAME,
- PM_LEX_STATE_BIT_DOT,
- PM_LEX_STATE_BIT_CLASS,
- PM_LEX_STATE_BIT_LABEL,
- PM_LEX_STATE_BIT_LABELED,
- PM_LEX_STATE_BIT_FITEM
-} pm_lex_state_bit_t;
-
-/**
- * This enum combines the various bits from the above enum into individual
- * values that represent the various states of the lexer.
- */
-typedef enum {
- PM_LEX_STATE_NONE = 0,
- PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
- PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
- PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
- PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
- PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
- PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
- PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
- PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
- PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
- PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
- PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
- PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
- PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
- PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
- PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
- PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
-} pm_lex_state_t;
-
-/**
- * The type of quote that a heredoc uses.
- */
-typedef enum {
- PM_HEREDOC_QUOTE_NONE,
- PM_HEREDOC_QUOTE_SINGLE = '\'',
- PM_HEREDOC_QUOTE_DOUBLE = '"',
- PM_HEREDOC_QUOTE_BACKTICK = '`',
-} pm_heredoc_quote_t;
-
-/**
- * The type of indentation that a heredoc uses.
- */
-typedef enum {
- PM_HEREDOC_INDENT_NONE,
- PM_HEREDOC_INDENT_DASH,
- PM_HEREDOC_INDENT_TILDE,
-} pm_heredoc_indent_t;
-
-/**
- * When lexing Ruby source, the lexer has a small amount of state to tell which
- * kind of token it is currently lexing. For example, when we find the start of
- * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
- * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
- * are found as part of a string.
- */
-typedef struct pm_lex_mode {
- /** The type of this lex mode. */
- enum {
- /** This state is used when any given token is being lexed. */
- PM_LEX_DEFAULT,
-
- /**
- * This state is used when we're lexing as normal but inside an embedded
- * expression of a string.
- */
- PM_LEX_EMBEXPR,
-
- /**
- * This state is used when we're lexing a variable that is embedded
- * directly inside of a string with the # shorthand.
- */
- PM_LEX_EMBVAR,
-
- /** This state is used when you are inside the content of a heredoc. */
- PM_LEX_HEREDOC,
-
- /**
- * This state is used when we are lexing a list of tokens, as in a %w
- * word list literal or a %i symbol list literal.
- */
- PM_LEX_LIST,
-
- /**
- * This state is used when a regular expression has been begun and we
- * are looking for the terminator.
- */
- PM_LEX_REGEXP,
-
- /**
- * This state is used when we are lexing a string or a string-like
- * token, as in string content with either quote or an xstring.
- */
- PM_LEX_STRING
- } mode;
-
- /** The data associated with this type of lex mode. */
- union {
- struct {
- /** This keeps track of the nesting level of the list. */
- size_t nesting;
-
- /** Whether or not interpolation is allowed in this list. */
- bool interpolation;
-
- /**
- * When lexing a list, it takes into account balancing the
- * terminator if the terminator is one of (), [], {}, or <>.
- */
- uint8_t incrementor;
-
- /** This is the terminator of the list literal. */
- uint8_t terminator;
-
- /**
- * This is the character set that should be used to delimit the
- * tokens within the list.
- */
- uint8_t breakpoints[11];
- } list;
-
- struct {
- /**
- * This keeps track of the nesting level of the regular expression.
- */
- size_t nesting;
-
- /**
- * When lexing a regular expression, it takes into account balancing
- * the terminator if the terminator is one of (), [], {}, or <>.
- */
- uint8_t incrementor;
-
- /** This is the terminator of the regular expression. */
- uint8_t terminator;
-
- /**
- * This is the character set that should be used to delimit the
- * tokens within the regular expression.
- */
- uint8_t breakpoints[6];
- } regexp;
-
- struct {
- /** This keeps track of the nesting level of the string. */
- size_t nesting;
-
- /** Whether or not interpolation is allowed in this string. */
- bool interpolation;
-
- /**
- * Whether or not at the end of the string we should allow a :,
- * which would indicate this was a dynamic symbol instead of a
- * string.
- */
- bool label_allowed;
-
- /**
- * When lexing a string, it takes into account balancing the
- * terminator if the terminator is one of (), [], {}, or <>.
- */
- uint8_t incrementor;
-
- /**
- * This is the terminator of the string. It is typically either a
- * single or double quote.
- */
- uint8_t terminator;
-
- /**
- * This is the character set that should be used to delimit the
- * tokens within the string.
- */
- uint8_t breakpoints[6];
- } string;
-
- struct {
- /** A pointer to the start of the heredoc identifier. */
- const uint8_t *ident_start;
-
- /** The length of the heredoc identifier. */
- size_t ident_length;
-
- /** The type of quote that the heredoc uses. */
- pm_heredoc_quote_t quote;
-
- /** The type of indentation that the heredoc uses. */
- pm_heredoc_indent_t indent;
-
- /**
- * This is the pointer to the character where lexing should resume
- * once the heredoc has been completely processed.
- */
- const uint8_t *next_start;
-
- /**
- * This is used to track the amount of common whitespace on each
- * line so that we know how much to dedent each line in the case of
- * a tilde heredoc.
- */
- size_t common_whitespace;
- } heredoc;
- } as;
-
- /** The previous lex state so that it knows how to pop. */
- struct pm_lex_mode *prev;
-} pm_lex_mode_t;
-
-/**
- * We pre-allocate a certain number of lex states in order to avoid having to
- * call malloc too many times while parsing. You really shouldn't need more than
- * this because you only really nest deeply when doing string interpolation.
- */
-#define PM_LEX_STACK_SIZE 4
+#include "prism/comments.h"
+#include "prism/diagnostic.h"
+#include "prism/line_offset_list.h"
+#include "prism/magic_comments.h"
+#include "prism/options.h"
/**
* The parser used to parse Ruby source.
*/
-typedef struct pm_parser pm_parser_t;
+typedef struct pm_parser_t pm_parser_t;
/**
- * While parsing, we keep track of a stack of contexts. This is helpful for
- * error recovery so that we can pop back to a previous context when we hit a
- * token that is understood by a parent context but not by the current context.
+ * Allocate and initialize a parser for the given source buffer and its size.
+ *
+ * @param arena The arena to use for all AST-lifetime allocations. It is caller-
+ * owned and must outlive the parser.
+ * @param source The source to parse.
+ * @param size The size of the source.
+ * @param options The optional options to use when parsing. These options must
+ * live for the whole lifetime of this parser.
+ * @returns The initialized parser. It is the responsibility of the caller to
+ * free the parser with `pm_parser_free()`.
*/
-typedef enum {
- /** a begin statement */
- PM_CONTEXT_BEGIN,
-
- /** expressions in block arguments using braces */
- PM_CONTEXT_BLOCK_BRACES,
-
- /** expressions in block arguments using do..end */
- PM_CONTEXT_BLOCK_KEYWORDS,
-
- /** a case when statements */
- PM_CONTEXT_CASE_WHEN,
-
- /** a case in statements */
- PM_CONTEXT_CASE_IN,
-
- /** a class declaration */
- PM_CONTEXT_CLASS,
+PRISM_EXPORTED_FUNCTION PRISM_NODISCARD pm_parser_t * pm_parser_new(pm_arena_t *arena, const uint8_t *source, size_t size, const pm_options_t *options) PRISM_NONNULL(1);
- /** a method definition */
- PM_CONTEXT_DEF,
-
- /** a method definition's parameters */
- PM_CONTEXT_DEF_PARAMS,
-
- /** a method definition's default parameter */
- PM_CONTEXT_DEFAULT_PARAMS,
-
- /** an else clause */
- PM_CONTEXT_ELSE,
-
- /** an elsif clause */
- PM_CONTEXT_ELSIF,
-
- /** an interpolated expression */
- PM_CONTEXT_EMBEXPR,
-
- /** an ensure statement */
- PM_CONTEXT_ENSURE,
+/**
+ * Free both the memory held by the given parser and the parser itself.
+ *
+ * @param parser The parser to free.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_free(pm_parser_t *parser) PRISM_NONNULL(1);
- /** an ensure statement within a method definition */
- PM_CONTEXT_ENSURE_DEF,
+/**
+ * When the encoding that is being used to parse the source is changed by prism,
+ * we provide the ability here to call out to a user-defined function.
+ */
+typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
- /** a for loop */
- PM_CONTEXT_FOR,
+/**
+ * This is the callback that is called when a token is lexed. It is passed the
+ * parser, the token that was lexed, and the opaque data pointer.
+ */
+typedef void (*pm_lex_callback_t)(pm_parser_t *parser, pm_token_t *token, void *data);
- /** a for loop's index */
- PM_CONTEXT_FOR_INDEX,
+/**
+ * Register a callback that will be called whenever prism changes the encoding
+ * it is using to parse based on the magic comment.
+ *
+ * @param parser The parser to register the callback with.
+ * @param callback The callback to register.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_encoding_changed_callback_set(pm_parser_t *parser, pm_encoding_changed_callback_t callback) PRISM_NONNULL(1);
- /** an if statement */
- PM_CONTEXT_IF,
+/**
+ * Register a callback that will be called whenever a token is lexed.
+ *
+ * @param parser The parser to register the callback with.
+ * @param data The opaque data to pass to the callback when it is called.
+ * @param callback The callback to register.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_lex_callback_set(pm_parser_t *parser, pm_lex_callback_t callback, void *data) PRISM_NONNULL(1);
- /** a lambda expression with braces */
- PM_CONTEXT_LAMBDA_BRACES,
+/**
+ * Returns the opaque data that is passed to the lex callback when it is called.
+ *
+ * @param parser The parser whose lex callback data we want to get.
+ * @returns The opaque data that is passed to the lex callback when it is called.
+ */
+PRISM_EXPORTED_FUNCTION void * pm_parser_lex_callback_data(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a lambda expression with do..end */
- PM_CONTEXT_LAMBDA_DO_END,
+/**
+ * Returns the raw pointer to the start of the source that is being parsed.
+ *
+ * @param parser the parser whose start pointer we want to get
+ * @returns the raw pointer to the start of the source that is being parsed
+ */
+PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_start(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** the top level context */
- PM_CONTEXT_MAIN,
+/**
+ * Returns the raw pointer to the end of the source that is being parsed.
+ *
+ * @param parser the parser whose end pointer we want to get
+ * @returns the raw pointer to the end of the source that is being parsed
+ */
+PRISM_EXPORTED_FUNCTION const uint8_t * pm_parser_end(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a module declaration */
- PM_CONTEXT_MODULE,
+/**
+ * Returns the line that the parser was considered to have started on.
+ *
+ * @param parser the parser whose start line we want to get
+ * @returns the line that the parser was considered to have started on
+ */
+PRISM_EXPORTED_FUNCTION int32_t pm_parser_start_line(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a parenthesized expression */
- PM_CONTEXT_PARENS,
+/**
+ * Returns the name of the encoding that is being used to parse the source.
+ *
+ * @param parser the parser whose encoding name we want to get
+ * @returns the name of the encoding that is being used to parse the source
+ */
+PRISM_EXPORTED_FUNCTION const char * pm_parser_encoding_name(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** an END block */
- PM_CONTEXT_POSTEXE,
+/**
+ * Returns the width of the character at the given pointer in the encoding that
+ * is being used to parse the source.
+ *
+ * @param parser the parser whose encoding we want to use
+ * @param start a pointer to the start of the character
+ * @param remaining the number of bytes remaining in the source
+ * @returns the width of the character in bytes
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_encoding_char_width(const pm_parser_t *parser, const uint8_t *start, ptrdiff_t remaining) PRISM_NONNULL(1, 2);
- /** a predicate inside an if/elsif/unless statement */
- PM_CONTEXT_PREDICATE,
+/**
+ * Returns whether or not the parser is using the US-ASCII encoding.
+ *
+ * @param parser the parser to check
+ * @returns true if the parser is using US-ASCII encoding, false otherwise
+ */
+PRISM_EXPORTED_FUNCTION bool pm_parser_encoding_us_ascii(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a BEGIN block */
- PM_CONTEXT_PREEXE,
+/**
+ * Returns the filepath that is being used to parse the source.
+ *
+ * @param parser the parser whose filepath we want to get
+ * @returns a pointer to the filepath string
+ */
+PRISM_EXPORTED_FUNCTION const pm_string_t * pm_parser_filepath(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a rescue else statement */
- PM_CONTEXT_RESCUE_ELSE,
+/**
+ * Find a constant in the parser's constant pool. Returns the id of the
+ * constant, or 0 if the constant is not found.
+ *
+ * @param parser the parser whose constant pool we want to search
+ * @param start a pointer to the start of the string to search for
+ * @param length the length of the string to search for
+ * @returns the id of the constant, or 0 if the constant is not found
+ */
+PRISM_EXPORTED_FUNCTION pm_constant_id_t pm_parser_constant_find(const pm_parser_t *parser, const uint8_t *start, size_t length) PRISM_NONNULL(1, 2);
- /** a rescue else statement within a method definition */
- PM_CONTEXT_RESCUE_ELSE_DEF,
+/**
+ * Returns the frozen string literal value of the parser, as determined by the
+ * frozen_string_literal magic comment or the option set on the parser.
+ *
+ * @param parser the parser whose frozen string literal value we want to get
+ * @returns -1 if disabled, 0 if unset, 1 if enabled
+ */
+PRISM_EXPORTED_FUNCTION int8_t pm_parser_frozen_string_literal(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a rescue statement */
- PM_CONTEXT_RESCUE,
+/**
+ * Returns the line offsets that are associated with the given parser.
+ *
+ * @param parser the parser whose line offsets we want to get
+ * @returns the line offsets that are associated with the given parser
+ */
+PRISM_EXPORTED_FUNCTION const pm_line_offset_list_t * pm_parser_line_offsets(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a rescue statement within a method definition */
- PM_CONTEXT_RESCUE_DEF,
+/**
+ * Returns the location of the __END__ keyword and its contents that is
+ * associated with the given parser.
+ *
+ * @param parser the parser whose data location we want to get
+ * @returns the location of the __END__ keyword and its contents for the
+ * given parser. If it is unset, then the length will be set to 0.
+ */
+PRISM_EXPORTED_FUNCTION const pm_location_t * pm_parser_data_loc(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a singleton class definition */
- PM_CONTEXT_SCLASS,
+/**
+ * Returns whether the given parser is continuable, meaning that it could become
+ * valid if more input were appended, as opposed to being definitively invalid.
+ *
+ * @param parser the parser whose continuable status we want to get
+ * @returns whether the given parser is continuable
+ */
+PRISM_EXPORTED_FUNCTION bool pm_parser_continuable(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** an unless statement */
- PM_CONTEXT_UNLESS,
+/**
+ * Returns the lex state of the parser. Note that this is an internal detail,
+ * and we are purposefully not returning an instance of the internal enum that
+ * we use to track this. This is only exposed because we need it for some very
+ * niche use cases. Most consumers should avoid this function.
+ *
+ * @param parser the parser whose lex state we want to get
+ * @returns the lex state of the parser
+ */
+PRISM_EXPORTED_FUNCTION int pm_parser_lex_state(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** an until statement */
- PM_CONTEXT_UNTIL,
+/**
+ * Returns the number of comments associated with the given parser.
+ *
+ * @param parser the parser whose comments we want to get the size of
+ * @returns the number of comments associated with the given parser
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** a while statement */
- PM_CONTEXT_WHILE,
-} pm_context_t;
+/**
+ * A callback function that can be used to process comments found while parsing.
+ */
+typedef void (*pm_comment_callback_t)(const pm_comment_t *comment, void *data);
-/** This is a node in a linked list of contexts. */
-typedef struct pm_context_node {
- /** The context that this node represents. */
- pm_context_t context;
+/**
+ * Iterates over the comments associated with the given parser and calls the
+ * given callback for each comment.
+ *
+ * @param parser the parser whose comments we want to iterate over
+ * @param callback the callback function to call for each comment. This function
+ * will be passed a pointer to the comment and the data parameter passed to
+ * this function.
+ * @param data the data to pass to the callback function for each comment. This
+ * can be NULL if no data needs to be passed to the callback function.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_comments_each(const pm_parser_t *parser, pm_comment_callback_t callback, void *data) PRISM_NONNULL(1);
- /** A pointer to the previous context in the linked list. */
- struct pm_context_node *prev;
-} pm_context_node_t;
+/**
+ * Returns the number of magic comments associated with the given parser.
+ *
+ * @param parser the parser whose magic comments we want to get the size of
+ * @returns the number of magic comments associated with the given parser
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_magic_comments_size(const pm_parser_t *parser) PRISM_NONNULL(1);
-/** This is the type of a comment that we've found while parsing. */
-typedef enum {
- PM_COMMENT_INLINE,
- PM_COMMENT_EMBDOC
-} pm_comment_type_t;
+/**
+ * A callback function that can be used to process magic comments found while parsing.
+ */
+typedef void (*pm_magic_comment_callback_t)(const pm_magic_comment_t *magic_comment, void *data);
/**
- * This is a node in the linked list of comments that we've found while parsing.
+ * Iterates over the magic comments associated with the given parser and calls the
+ * given callback for each magic comment.
*
- * @extends pm_list_node_t
+ * @param parser the parser whose magic comments we want to iterate over
+ * @param callback the callback function to call for each magic comment. This
+ * function will be passed a pointer to the magic comment and the data
+ * parameter passed to this function.
+ * @param data the data to pass to the callback function for each magic comment.
+ * This can be NULL if no data needs to be passed to the callback function.
*/
-typedef struct pm_comment {
- /** The embedded base node. */
- pm_list_node_t node;
+PRISM_EXPORTED_FUNCTION void pm_parser_magic_comments_each(const pm_parser_t *parser, pm_magic_comment_callback_t callback, void *data) PRISM_NONNULL(1);
- /** The location of the comment in the source. */
- pm_location_t location;
+/**
+ * Returns the number of errors associated with the given parser.
+ *
+ * @param parser the parser whose errors we want to get the size of
+ * @returns the number of errors associated with the given parser
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_errors_size(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** The type of comment that we've found. */
- pm_comment_type_t type;
-} pm_comment_t;
+/**
+ * Returns the number of warnings associated with the given parser.
+ *
+ * @param parser the parser whose warnings we want to get the size of
+ * @returns the number of warnings associated with the given parser
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_warnings_size(const pm_parser_t *parser) PRISM_NONNULL(1);
/**
- * This is a node in the linked list of magic comments that we've found while
+ * A callback function that can be used to process diagnostics found while
* parsing.
+ */
+typedef void (*pm_diagnostic_callback_t)(const pm_diagnostic_t *diagnostic, void *data);
+
+/**
+ * Iterates over the errors associated with the given parser and calls the
+ * given callback for each error.
*
- * @extends pm_list_node_t
+ * @param parser the parser whose errors we want to iterate over
+ * @param callback the callback function to call for each error. This function
+ * will be passed a pointer to the error and the data parameter passed to
+ * this function.
+ * @param data the data to pass to the callback function for each error. This
+ * can be NULL if no data needs to be passed to the callback function.
*/
-typedef struct {
- /** The embedded base node. */
- pm_list_node_t node;
+PRISM_EXPORTED_FUNCTION void pm_parser_errors_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1);
- /** A pointer to the start of the key in the source. */
- const uint8_t *key_start;
+/**
+ * Iterates over the warnings associated with the given parser and calls the
+ * given callback for each warning.
+ *
+ * @param parser the parser whose warnings we want to iterate over
+ * @param callback the callback function to call for each warning. This function
+ * will be passed a pointer to the warning and the data parameter passed to
+ * this function.
+ * @param data the data to pass to the callback function for each warning. This
+ * can be NULL if no data needs to be passed to the callback function.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_warnings_each(const pm_parser_t *parser, pm_diagnostic_callback_t callback, void *data) PRISM_NONNULL(1);
- /** A pointer to the start of the value in the source. */
- const uint8_t *value_start;
+/**
+ * Returns the number of constants in the constant pool associated with the
+ * given parser.
+ *
+ * @param parser the parser whose constant pool constants we want to get the
+ * size of
+ * @returns the number of constants in the constant pool associated with the
+ * given parser
+ */
+PRISM_EXPORTED_FUNCTION size_t pm_parser_constants_size(const pm_parser_t *parser) PRISM_NONNULL(1);
- /** The length of the key in the source. */
- uint32_t key_length;
+/**
+ * A callback function that can be used to process constants found while
+ * parsing.
+ */
+typedef void (*pm_constant_callback_t)(const pm_constant_t *constant, void *data);
- /** The length of the value in the source. */
- uint32_t value_length;
-} pm_magic_comment_t;
+/**
+ * Iterates over the constants in the constant pool associated with the given
+ * parser and calls the given callback for each constant.
+ *
+ * @param parser the parser whose constants we want to iterate over
+ * @param callback the callback function to call for each constant. This function
+ * will be passed a pointer to the constant and the data parameter passed to
+ * this function.
+ * @param data the data to pass to the callback function for each constant. This
+ * can be NULL if no data needs to be passed to the callback function.
+ */
+PRISM_EXPORTED_FUNCTION void pm_parser_constants_each(const pm_parser_t *parser, pm_constant_callback_t callback, void *data) PRISM_NONNULL(1);
/**
- * When the encoding that is being used to parse the source is changed by prism,
- * we provide the ability here to call out to a user-defined function.
+ * Returns a pointer to the constant at the given id in the constant pool
+ * associated with the given parser.
+ *
+ * @param parser the parser whose constant pool we want to look up from
+ * @param constant_id the id of the constant to look up (1-based)
+ * @returns a pointer to the constant at the given id
*/
-typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
+PRISM_EXPORTED_FUNCTION const pm_constant_t * pm_parser_constant(const pm_parser_t *parser, pm_constant_id_t constant_id) PRISM_NONNULL(1);
/**
- * When you are lexing through a file, the lexer needs all of the information
- * that the parser additionally provides (for example, the local table). So if
- * you want to properly lex Ruby, you need to actually lex it in the context of
- * the parser. In order to provide this functionality, we optionally allow a
- * struct to be attached to the parser that calls back out to a user-provided
- * callback when each token is lexed.
- */
-typedef struct {
- /**
- * This opaque pointer is used to provide whatever information the user
- * deemed necessary to the callback. In our case we use it to pass the array
- * that the tokens get appended into.
- */
- void *data;
-
- /**
- * This is the callback that is called when a token is lexed. It is passed
- * the opaque data pointer, the parser, and the token that was lexed.
- */
- void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
-} pm_lex_callback_t;
-
-/**
- * This struct represents a node in a linked list of scopes. Some scopes can see
- * into their parent scopes, while others cannot.
- */
-typedef struct pm_scope {
- /** The IDs of the locals in the given scope. */
- pm_constant_id_list_t locals;
-
- /** A pointer to the previous scope in the linked list. */
- struct pm_scope *previous;
-
- /**
- * A boolean indicating whether or not this scope can see into its parent.
- * If closed is true, then the scope cannot see into its parent.
- */
- bool closed;
-
- /**
- * A boolean indicating whether or not this scope has explicit parameters.
- * This is necessary to determine whether or not numbered parameters are
- * allowed.
- */
- bool explicit_params;
-
- /**
- * An integer indicating the number of numbered parameters on this scope.
- * This is necessary to determine if child blocks are allowed to use
- * numbered parameters, and to pass information to consumers of the AST
- * about how many numbered parameters exist.
- */
- uint8_t numbered_parameters;
-} pm_scope_t;
-
-/**
- * This struct represents the overall parser. It contains a reference to the
- * source file, as well as pointers that indicate where in the source it's
- * currently parsing. It also contains the most recent and current token that
- * it's considering.
- */
-struct pm_parser {
- /** The current state of the lexer. */
- pm_lex_state_t lex_state;
-
- /** Tracks the current nesting of (), [], and {}. */
- int enclosure_nesting;
-
- /**
- * Used to temporarily track the nesting of enclosures to determine if a {
- * is the beginning of a lambda following the parameters of a lambda.
- */
- int lambda_enclosure_nesting;
-
- /**
- * Used to track the nesting of braces to ensure we get the correct value
- * when we are interpolating blocks with braces.
- */
- int brace_nesting;
-
- /**
- * The stack used to determine if a do keyword belongs to the predicate of a
- * while, until, or for loop.
- */
- pm_state_stack_t do_loop_stack;
-
- /**
- * The stack used to determine if a do keyword belongs to the beginning of a
- * block.
- */
- pm_state_stack_t accepts_block_stack;
-
- /** A stack of lex modes. */
- struct {
- /** The current mode of the lexer. */
- pm_lex_mode_t *current;
-
- /** The stack of lexer modes. */
- pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
-
- /** The current index into the lexer mode stack. */
- size_t index;
- } lex_modes;
-
- /** The pointer to the start of the source. */
- const uint8_t *start;
-
- /** The pointer to the end of the source. */
- const uint8_t *end;
-
- /** The previous token we were considering. */
- pm_token_t previous;
-
- /** The current token we're considering. */
- pm_token_t current;
-
- /**
- * This is a special field set on the parser when we need the parser to jump
- * to a specific location when lexing the next token, as opposed to just
- * using the end of the previous token. Normally this is NULL.
- */
- const uint8_t *next_start;
-
- /**
- * This field indicates the end of a heredoc whose identifier was found on
- * the current line. If another heredoc is found on the same line, then this
- * will be moved forward to the end of that heredoc. If no heredocs are
- * found on a line then this is NULL.
- */
- const uint8_t *heredoc_end;
-
- /** The list of comments that have been found while parsing. */
- pm_list_t comment_list;
-
- /** The list of magic comments that have been found while parsing. */
- pm_list_t magic_comment_list;
-
- /** The optional location of the __END__ keyword and its contents. */
- pm_location_t data_loc;
-
- /** The list of warnings that have been found while parsing. */
- pm_list_t warning_list;
-
- /** The list of errors that have been found while parsing. */
- pm_list_t error_list;
-
- /** The current local scope. */
- pm_scope_t *current_scope;
-
- /** The current parsing context. */
- pm_context_node_t *current_context;
-
- /**
- * The encoding functions for the current file is attached to the parser as
- * it's parsing so that it can change with a magic comment.
- */
- const pm_encoding_t *encoding;
-
- /**
- * When the encoding that is being used to parse the source is changed by
- * prism, we provide the ability here to call out to a user-defined
- * function.
- */
- pm_encoding_changed_callback_t encoding_changed_callback;
-
- /**
- * This pointer indicates where a comment must start if it is to be
- * considered an encoding comment.
- */
- const uint8_t *encoding_comment_start;
-
- /**
- * This is an optional callback that can be attached to the parser that will
- * be called whenever a new token is lexed by the parser.
- */
- pm_lex_callback_t *lex_callback;
-
- /**
- * This is the path of the file being parsed. We use the filepath when
- * constructing SourceFileNodes.
- */
- pm_string_t filepath_string;
-
- /**
- * This constant pool keeps all of the constants defined throughout the file
- * so that we can reference them later.
- */
- pm_constant_pool_t constant_pool;
-
- /** This is the list of newline offsets in the source file. */
- pm_newline_list_t newline_list;
-
- /**
- * We want to add a flag to integer nodes that indicates their base. We only
- * want to parse these once, but we don't have space on the token itself to
- * communicate this information. So we store it here and pass it through
- * when we find tokens that we need it for.
- */
- pm_node_flags_t integer_base;
-
- /**
- * This string is used to pass information from the lexer to the parser. It
- * is particularly necessary because of escape sequences.
- */
- pm_string_t current_string;
-
- /**
- * The line number at the start of the parse. This will be used to offset
- * the line numbers of all of the locations.
- */
- int32_t start_line;
-
- /**
- * When a string-like expression is being lexed, any byte or escape sequence
- * that resolves to a value whose top bit is set (i.e., >= 0x80) will
- * explicitly set the encoding to the same encoding as the source.
- * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
- * resolves to a value whose top bit is set, then the encoding will be
- * explicitly set to UTF-8.
- *
- * The _next_ time this happens, if the encoding that is about to become the
- * explicitly set encoding does not match the previously set explicit
- * encoding, a mixed encoding error will be emitted.
- *
- * When the expression is finished being lexed, the explicit encoding
- * controls the encoding of the expression. For the most part this means
- * that the expression will either be encoded in the source encoding or
- * UTF-8. This holds for all encodings except US-ASCII. If the source is
- * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
- * expression will be encoded as ASCII-8BIT.
- *
- * Note that if the expression is a list, different elements within the same
- * list can have different encodings, so this will get reset between each
- * element. Furthermore all of this only applies to lists that support
- * interpolation, because otherwise escapes that could change the encoding
- * are ignored.
- *
- * At first glance, it may make more sense for this to live on the lexer
- * mode, but we need it here to communicate back to the parser for character
- * literals that do not push a new lexer mode.
- */
- const pm_encoding_t *explicit_encoding;
-
- /** Whether or not we're at the beginning of a command. */
- bool command_start;
-
- /** Whether or not we're currently recovering from a syntax error. */
- bool recovering;
-
- /**
- * Whether or not the encoding has been changed by a magic comment. We use
- * this to provide a fast path for the lexer instead of going through the
- * function pointer.
- */
- bool encoding_changed;
-
- /**
- * This flag indicates that we are currently parsing a pattern matching
- * expression and impacts that calculation of newlines.
- */
- bool pattern_matching_newlines;
-
- /** This flag indicates that we are currently parsing a keyword argument. */
- bool in_keyword_arg;
-
- /** The current parameter name id on parsing its default value. */
- pm_constant_id_t current_param_name;
-
- /**
- * Whether or not the parser has seen a token that has semantic meaning
- * (i.e., a token that is not a comment or whitespace).
- */
- bool semantic_token_seen;
-
- /**
- * Whether or not we have found a frozen_string_literal magic comment with
- * a true value.
- */
- bool frozen_string_literal;
-
- /**
- * Whether or not we should emit warnings. This will be set to false if the
- * consumer of the library specified it, usually because they are parsing
- * when $VERBOSE is nil.
- */
- bool suppress_warnings;
-};
+ * Parse the source associated with the given parser and return its AST.
+ *
+ * @param parser The parser to use.
+ * @returns The AST representing the source.
+ */
+PRISM_EXPORTED_FUNCTION pm_node_t * pm_parse(pm_parser_t *parser) PRISM_NONNULL(1);
#endif