/** * @file parser.h * * The parser used to parse Ruby source. */ #ifndef PRISM_PARSER_H #define PRISM_PARSER_H #include "prism/ast.h" #include "prism/defines.h" #include "prism/enc/pm_encoding.h" #include "prism/util/pm_constant_pool.h" #include "prism/util/pm_list.h" #include "prism/util/pm_newline_list.h" #include "prism/util/pm_state_stack.h" #include "prism/util/pm_string.h" #include /** * This enum provides various bits that represent different kinds of states that * the lexer can track. This is used to determine which kind of token to return * based on the context of the parser. */ typedef enum { PM_LEX_STATE_BIT_BEG, PM_LEX_STATE_BIT_END, PM_LEX_STATE_BIT_ENDARG, PM_LEX_STATE_BIT_ENDFN, PM_LEX_STATE_BIT_ARG, PM_LEX_STATE_BIT_CMDARG, PM_LEX_STATE_BIT_MID, PM_LEX_STATE_BIT_FNAME, PM_LEX_STATE_BIT_DOT, PM_LEX_STATE_BIT_CLASS, PM_LEX_STATE_BIT_LABEL, PM_LEX_STATE_BIT_LABELED, PM_LEX_STATE_BIT_FITEM } pm_lex_state_bit_t; /** * This enum combines the various bits from the above enum into individual * values that represent the various states of the lexer. */ typedef enum { PM_LEX_STATE_NONE = 0, PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN } pm_lex_state_t; /** * The type of quote that a heredoc uses. */ typedef enum { PM_HEREDOC_QUOTE_NONE, PM_HEREDOC_QUOTE_SINGLE = '\'', PM_HEREDOC_QUOTE_DOUBLE = '"', PM_HEREDOC_QUOTE_BACKTICK = '`', } pm_heredoc_quote_t; /** * The type of indentation that a heredoc uses. */ typedef enum { PM_HEREDOC_INDENT_NONE, PM_HEREDOC_INDENT_DASH, PM_HEREDOC_INDENT_TILDE, } pm_heredoc_indent_t; /** * When lexing Ruby source, the lexer has a small amount of state to tell which * kind of token it is currently lexing. For example, when we find the start of * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that * are found as part of a string. */ typedef struct pm_lex_mode { /** The type of this lex mode. */ enum { /** This state is used when any given token is being lexed. */ PM_LEX_DEFAULT, /** * This state is used when we're lexing as normal but inside an embedded * expression of a string. */ PM_LEX_EMBEXPR, /** * This state is used when we're lexing a variable that is embedded * directly inside of a string with the # shorthand. */ PM_LEX_EMBVAR, /** This state is used when you are inside the content of a heredoc. */ PM_LEX_HEREDOC, /** * This state is used when we are lexing a list of tokens, as in a %w * word list literal or a %i symbol list literal. */ PM_LEX_LIST, /** * This state is used when a regular expression has been begun and we * are looking for the terminator. */ PM_LEX_REGEXP, /** * This state is used when we are lexing a string or a string-like * token, as in string content with either quote or an xstring. */ PM_LEX_STRING } mode; /** The data associated with this type of lex mode. */ union { struct { /** This keeps track of the nesting level of the list. */ size_t nesting; /** Whether or not interpolation is allowed in this list. */ bool interpolation; /** * When lexing a list, it takes into account balancing the * terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /** This is the terminator of the list literal. */ uint8_t terminator; /** * This is the character set that should be used to delimit the * tokens within the list. */ uint8_t breakpoints[11]; } list; struct { /** * This keeps track of the nesting level of the regular expression. */ size_t nesting; /** * When lexing a regular expression, it takes into account balancing * the terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /** This is the terminator of the regular expression. */ uint8_t terminator; /** * This is the character set that should be used to delimit the * tokens within the regular expression. */ uint8_t breakpoints[6]; } regexp; struct { /** This keeps track of the nesting level of the string. */ size_t nesting; /** Whether or not interpolation is allowed in this string. */ bool interpolation; /** * Whether or not at the end of the string we should allow a :, * which would indicate this was a dynamic symbol instead of a * string. */ bool label_allowed; /** * When lexing a string, it takes into account balancing the * terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /** * This is the terminator of the string. It is typically either a * single or double quote. */ uint8_t terminator; /** * This is the character set that should be used to delimit the * tokens within the string. */ uint8_t breakpoints[6]; } string; struct { /** A pointer to the start of the heredoc identifier. */ const uint8_t *ident_start; /** The length of the heredoc identifier. */ size_t ident_length; /** The type of quote that the heredoc uses. */ pm_heredoc_quote_t quote; /** The type of indentation that the heredoc uses. */ pm_heredoc_indent_t indent; /** * This is the pointer to the character where lexing should resume * once the heredoc has been completely processed. */ const uint8_t *next_start; /** * This is used to track the amount of common whitespace on each * line so that we know how much to dedent each line in the case of * a tilde heredoc. */ size_t common_whitespace; } heredoc; } as; /** The previous lex state so that it knows how to pop. */ struct pm_lex_mode *prev; } pm_lex_mode_t; /** * We pre-allocate a certain number of lex states in order to avoid having to * call malloc too many times while parsing. You really shouldn't need more than * this because you only really nest deeply when doing string interpolation. */ #define PM_LEX_STACK_SIZE 4 /** * The parser used to parse Ruby source. */ typedef struct pm_parser pm_parser_t; /** * While parsing, we keep track of a stack of contexts. This is helpful for * error recovery so that we can pop back to a previous context when we hit a * token that is understood by a parent context but not by the current context. */ typedef enum { /** a begin statement */ PM_CONTEXT_BEGIN, /** expressions in block arguments using braces */ PM_CONTEXT_BLOCK_BRACES, /** expressions in block arguments using do..end */ PM_CONTEXT_BLOCK_KEYWORDS, /** a case when statements */ PM_CONTEXT_CASE_WHEN, /** a case in statements */ PM_CONTEXT_CASE_IN, /** a class declaration */ PM_CONTEXT_CLASS, /** a method definition */ PM_CONTEXT_DEF, /** a method definition's parameters */ PM_CONTEXT_DEF_PARAMS, /** a method definition's default parameter */ PM_CONTEXT_DEFAULT_PARAMS, /** an else clause */ PM_CONTEXT_ELSE, /** an elsif clause */ PM_CONTEXT_ELSIF, /** an interpolated expression */ PM_CONTEXT_EMBEXPR, /** an ensure statement */ PM_CONTEXT_ENSURE, /** a for loop */ PM_CONTEXT_FOR, /** a for loop's index */ PM_CONTEXT_FOR_INDEX, /** an if statement */ PM_CONTEXT_IF, /** a lambda expression with braces */ PM_CONTEXT_LAMBDA_BRACES, /** a lambda expression with do..end */ PM_CONTEXT_LAMBDA_DO_END, /** the top level context */ PM_CONTEXT_MAIN, /** a module declaration */ PM_CONTEXT_MODULE, /** a parenthesized expression */ PM_CONTEXT_PARENS, /** an END block */ PM_CONTEXT_POSTEXE, /** a predicate inside an if/elsif/unless statement */ PM_CONTEXT_PREDICATE, /** a BEGIN block */ PM_CONTEXT_PREEXE, /** a rescue else statement */ PM_CONTEXT_RESCUE_ELSE, /** a rescue statement */ PM_CONTEXT_RESCUE, /** a singleton class definition */ PM_CONTEXT_SCLASS, /** an unless statement */ PM_CONTEXT_UNLESS, /** an until statement */ PM_CONTEXT_UNTIL, /** a while statement */ PM_CONTEXT_WHILE, } pm_context_t; /** This is a node in a linked list of contexts. */ typedef struct pm_context_node { /** The context that this node represents. */ pm_context_t context; /** A pointer to the previous context in the linked list. */ struct pm_context_node *prev; } pm_context_node_t; /** This is the type of a comment that we've found while parsing. */ typedef enum { PM_COMMENT_INLINE, PM_COMMENT_EMBDOC, PM_COMMENT___END__ } pm_comment_type_t; /** * This is a node in the linked list of comments that we've found while parsing. * * @extends pm_list_node_t */ typedef struct pm_comment { /** The embedded base node. */ pm_list_node_t node; /** A pointer to the start of the comment in the source. */ const uint8_t *start; /** A pointer to the end of the comment in the source. */ const uint8_t *end; /** The type of comment that we've found. */ pm_comment_type_t type; } pm_comment_t; /** * This is a node in the linked list of magic comments that we've found while * parsing. * * @extends pm_list_node_t */ typedef struct { /** The embedded base node. */ pm_list_node_t node; /** A pointer to the start of the key in the source. */ const uint8_t *key_start; /** A pointer to the start of the value in the source. */ const uint8_t *value_start; /** The length of the key in the source. */ uint32_t key_length; /** The length of the value in the source. */ uint32_t value_length; } pm_magic_comment_t; /** * When the encoding that is being used to parse the source is changed by prism, * we provide the ability here to call out to a user-defined function. */ typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser); /** * When an encoding is encountered that isn't understood by prism, we provide * the ability here to call out to a user-defined function to get an encoding * struct. If the function returns something that isn't NULL, we set that to * our encoding and use it to parse identifiers. */ typedef pm_encoding_t *(*pm_encoding_decode_callback_t)(pm_parser_t *parser, const uint8_t *name, size_t width); /** * When you are lexing through a file, the lexer needs all of the information * that the parser additionally provides (for example, the local table). So if * you want to properly lex Ruby, you need to actually lex it in the context of * the parser. In order to provide this functionality, we optionally allow a * struct to be attached to the parser that calls back out to a user-provided * callback when each token is lexed. */ typedef struct { /** * This opaque pointer is used to provide whatever information the user * deemed necessary to the callback. In our case we use it to pass the array * that the tokens get appended into. */ void *data; /** * This is the callback that is called when a token is lexed. It is passed * the opaque data pointer, the parser, and the token that was lexed. */ void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token); } pm_lex_callback_t; /** * This struct represents a node in a linked list of scopes. Some scopes can see * into their parent scopes, while others cannot. */ typedef struct pm_scope { /** The IDs of the locals in the given scope. */ pm_constant_id_list_t locals; /** A pointer to the previous scope in the linked list. */ struct pm_scope *previous; /** * A boolean indicating whether or not this scope can see into its parent. * If closed is true, then the scope cannot see into its parent. */ bool closed; /** * A boolean indicating whether or not this scope has explicit parameters. * This is necessary to determine whether or not numbered parameters are * allowed. */ bool explicit_params; /** * A boolean indicating whether or not this scope has numbered parameters. * This is necessary to determine if child blocks are allowed to use * numbered parameters. */ bool numbered_params; /** * A transparent scope is a scope that cannot have locals set on itself. * When a local is set on this scope, it will instead be set on the parent * scope's local table. */ bool transparent; } pm_scope_t; /** * This struct represents the overall parser. It contains a reference to the * source file, as well as pointers that indicate where in the source it's * currently parsing. It also contains the most recent and current token that * it's considering. */ struct pm_parser { /** The current state of the lexer. */ pm_lex_state_t lex_state; /** Tracks the current nesting of (), [], and {}. */ int enclosure_nesting; /** * Used to temporarily track the nesting of enclosures to determine if a { * is the beginning of a lambda following the parameters of a lambda. */ int lambda_enclosure_nesting; /** * Used to track the nesting of braces to ensure we get the correct value * when we are interpolating blocks with braces. */ int brace_nesting; /** * The stack used to determine if a do keyword belongs to the predicate of a * while, until, or for loop. */ pm_state_stack_t do_loop_stack; /** * The stack used to determine if a do keyword belongs to the beginning of a * block. */ pm_state_stack_t accepts_block_stack; /** A stack of lex modes. */ struct { /** The current mode of the lexer. */ pm_lex_mode_t *current; /** The stack of lexer modes. */ pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; /** The current index into the lexer mode stack. */ size_t index; } lex_modes; /** The pointer to the start of the source. */ const uint8_t *start; /** The pointer to the end of the source. */ const uint8_t *end; /** The previous token we were considering. */ pm_token_t previous; /** The current token we're considering. */ pm_token_t current; /** * This is a special field set on the parser when we need the parser to jump * to a specific location when lexing the next token, as opposed to just * using the end of the previous token. Normally this is NULL. */ const uint8_t *next_start; /** * This field indicates the end of a heredoc whose identifier was found on * the current line. If another heredoc is found on the same line, then this * will be moved forward to the end of that heredoc. If no heredocs are * found on a line then this is NULL. */ const uint8_t *heredoc_end; /** The list of comments that have been found while parsing. */ pm_list_t comment_list; /** The list of magic comments that have been found while parsing. */ pm_list_t magic_comment_list; /** The list of warnings that have been found while parsing. */ pm_list_t warning_list; /** The list of errors that have been found while parsing. */ pm_list_t error_list; /** The current local scope. */ pm_scope_t *current_scope; /** The current parsing context. */ pm_context_node_t *current_context; /** * The encoding functions for the current file is attached to the parser as * it's parsing so that it can change with a magic comment. */ pm_encoding_t encoding; /** * When the encoding that is being used to parse the source is changed by * prism, we provide the ability here to call out to a user-defined * function. */ pm_encoding_changed_callback_t encoding_changed_callback; /** * When an encoding is encountered that isn't understood by prism, we * provide the ability here to call out to a user-defined function to get an * encoding struct. If the function returns something that isn't NULL, we * set that to our encoding and use it to parse identifiers. */ pm_encoding_decode_callback_t encoding_decode_callback; /** * This pointer indicates where a comment must start if it is to be * considered an encoding comment. */ const uint8_t *encoding_comment_start; /** * This is an optional callback that can be attached to the parser that will * be called whenever a new token is lexed by the parser. */ pm_lex_callback_t *lex_callback; /** * This is the path of the file being parsed. We use the filepath when * constructing SourceFileNodes. */ pm_string_t filepath_string; /** * This constant pool keeps all of the constants defined throughout the file * so that we can reference them later. */ pm_constant_pool_t constant_pool; /** This is the list of newline offsets in the source file. */ pm_newline_list_t newline_list; /** * We want to add a flag to integer nodes that indicates their base. We only * want to parse these once, but we don't have space on the token itself to * communicate this information. So we store it here and pass it through * when we find tokens that we need it for. */ pm_node_flags_t integer_base; /** * This string is used to pass information from the lexer to the parser. It * is particularly necessary because of escape sequences. */ pm_string_t current_string; /** * The line number at the start of the parse. This will be used to offset * the line numbers of all of the locations. */ uint32_t start_line; /** Whether or not we're at the beginning of a command. */ bool command_start; /** Whether or not we're currently recovering from a syntax error. */ bool recovering; /** * Whether or not the encoding has been changed by a magic comment. We use * this to provide a fast path for the lexer instead of going through the * function pointer. */ bool encoding_changed; /** * This flag indicates that we are currently parsing a pattern matching * expression and impacts that calculation of newlines. */ bool pattern_matching_newlines; /** This flag indicates that we are currently parsing a keyword argument. */ bool in_keyword_arg; /** * Whether or not the parser has seen a token that has semantic meaning * (i.e., a token that is not a comment or whitespace). */ bool semantic_token_seen; /** * Whether or not we have found a frozen_string_literal magic comment with * a true value. */ bool frozen_string_literal; /** * Whether or not we should emit warnings. This will be set to false if the * consumer of the library specified it, usually because they are parsing * when $VERBOSE is nil. */ bool suppress_warnings; }; #endif