#ifndef PRISM_INTERNAL_PARSER_H #define PRISM_INTERNAL_PARSER_H #include "prism/compiler/accel.h" #include "prism/internal/arena.h" #include "prism/internal/constant_pool.h" #include "prism/internal/encoding.h" #include "prism/internal/list.h" #include "prism/internal/options.h" #include "prism/internal/static_literals.h" #include "prism/ast.h" #include "prism/line_offset_list.h" #include "prism/parser.h" #include #include #include /* * This enum provides various bits that represent different kinds of states that * the lexer can track. This is used to determine which kind of token to return * based on the context of the parser. */ typedef enum { PM_LEX_STATE_BIT_BEG, PM_LEX_STATE_BIT_END, PM_LEX_STATE_BIT_ENDARG, PM_LEX_STATE_BIT_ENDFN, PM_LEX_STATE_BIT_ARG, PM_LEX_STATE_BIT_CMDARG, PM_LEX_STATE_BIT_MID, PM_LEX_STATE_BIT_FNAME, PM_LEX_STATE_BIT_DOT, PM_LEX_STATE_BIT_CLASS, PM_LEX_STATE_BIT_LABEL, PM_LEX_STATE_BIT_LABELED, PM_LEX_STATE_BIT_FITEM } pm_lex_state_bit_t; /* * This enum combines the various bits from the above enum into individual * values that represent the various states of the lexer. */ typedef enum { PM_LEX_STATE_NONE = 0, PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG), PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END), PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG), PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN), PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG), PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG), PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID), PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME), PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT), PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS), PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL), PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED), PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM), PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS, PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG, PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN } pm_lex_state_t; /* * The type of quote that a heredoc uses. */ typedef enum { PM_HEREDOC_QUOTE_NONE, PM_HEREDOC_QUOTE_SINGLE = '\'', PM_HEREDOC_QUOTE_DOUBLE = '"', PM_HEREDOC_QUOTE_BACKTICK = '`', } pm_heredoc_quote_t; /* * The type of indentation that a heredoc uses. */ typedef enum { PM_HEREDOC_INDENT_NONE, PM_HEREDOC_INDENT_DASH, PM_HEREDOC_INDENT_TILDE, } pm_heredoc_indent_t; /* * All of the information necessary to store to lexing a heredoc. */ typedef struct { /* A pointer to the start of the heredoc identifier. */ const uint8_t *ident_start; /* The length of the heredoc identifier. */ size_t ident_length; /* The type of quote that the heredoc uses. */ pm_heredoc_quote_t quote; /* The type of indentation that the heredoc uses. */ pm_heredoc_indent_t indent; } pm_heredoc_lex_mode_t; /* * When lexing Ruby source, the lexer has a small amount of state to tell which * kind of token it is currently lexing. For example, when we find the start of * a string, the first token that we return is a TOKEN_STRING_BEGIN token. After * that the lexer is now in the PM_LEX_STRING mode, and will return tokens that * are found as part of a string. */ typedef struct pm_lex_mode { /* The type of this lex mode. */ enum { /* This state is used when any given token is being lexed. */ PM_LEX_DEFAULT, /* * This state is used when we're lexing as normal but inside an embedded * expression of a string. */ PM_LEX_EMBEXPR, /* * This state is used when we're lexing a variable that is embedded * directly inside of a string with the # shorthand. */ PM_LEX_EMBVAR, /* This state is used when you are inside the content of a heredoc. */ PM_LEX_HEREDOC, /* * This state is used when we are lexing a list of tokens, as in a %w * word list literal or a %i symbol list literal. */ PM_LEX_LIST, /* * This state is used when a regular expression has been begun and we * are looking for the terminator. */ PM_LEX_REGEXP, /* * This state is used when we are lexing a string or a string-like * token, as in string content with either quote or an xstring. */ PM_LEX_STRING } mode; /* The data associated with this type of lex mode. */ union { struct { /* This keeps track of the nesting level of the list. */ size_t nesting; /* Whether or not interpolation is allowed in this list. */ bool interpolation; /* * When lexing a list, it takes into account balancing the * terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /* This is the terminator of the list literal. */ uint8_t terminator; /* * This is the character set that should be used to delimit the * tokens within the list. */ uint8_t breakpoints[11]; } list; struct { /* * This keeps track of the nesting level of the regular expression. */ size_t nesting; /* * When lexing a regular expression, it takes into account balancing * the terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /* This is the terminator of the regular expression. */ uint8_t terminator; /* * This is the character set that should be used to delimit the * tokens within the regular expression. */ uint8_t breakpoints[7]; } regexp; struct { /* This keeps track of the nesting level of the string. */ size_t nesting; /* Whether or not interpolation is allowed in this string. */ bool interpolation; /* * Whether or not at the end of the string we should allow a :, * which would indicate this was a dynamic symbol instead of a * string. */ bool label_allowed; /* * When lexing a string, it takes into account balancing the * terminator if the terminator is one of (), [], {}, or <>. */ uint8_t incrementor; /* * This is the terminator of the string. It is typically either a * single or double quote. */ uint8_t terminator; /* * This is the character set that should be used to delimit the * tokens within the string. */ uint8_t breakpoints[7]; } string; struct { /* * All of the data necessary to lex a heredoc. */ pm_heredoc_lex_mode_t base; /* * This is the pointer to the character where lexing should resume * once the heredoc has been completely processed. */ const uint8_t *next_start; /* * This is used to track the amount of common whitespace on each * line so that we know how much to dedent each line in the case of * a tilde heredoc. */ size_t *common_whitespace; /* True if the previous token ended with a line continuation. */ bool line_continuation; } heredoc; } as; /* The previous lex state so that it knows how to pop. */ struct pm_lex_mode *prev; } pm_lex_mode_t; /* * We pre-allocate a certain number of lex states in order to avoid having to * call malloc too many times while parsing. You really shouldn't need more than * this because you only really nest deeply when doing string interpolation. */ #define PM_LEX_STACK_SIZE 4 /* * While parsing, we keep track of a stack of contexts. This is helpful for * error recovery so that we can pop back to a previous context when we hit a * token that is understood by a parent context but not by the current context. */ typedef enum { /* a null context, used for returning a value from a function */ PM_CONTEXT_NONE = 0, /* a begin statement */ PM_CONTEXT_BEGIN, /* an ensure statement with an explicit begin */ PM_CONTEXT_BEGIN_ENSURE, /* a rescue else statement with an explicit begin */ PM_CONTEXT_BEGIN_ELSE, /* a rescue statement with an explicit begin */ PM_CONTEXT_BEGIN_RESCUE, /* expressions in block arguments using braces */ PM_CONTEXT_BLOCK_BRACES, /* expressions in block arguments using do..end */ PM_CONTEXT_BLOCK_KEYWORDS, /* an ensure statement within a do..end block */ PM_CONTEXT_BLOCK_ENSURE, /* a rescue else statement within a do..end block */ PM_CONTEXT_BLOCK_ELSE, /* expressions in block parameters `foo do |...| end ` */ PM_CONTEXT_BLOCK_PARAMETERS, /* a rescue statement within a do..end block */ PM_CONTEXT_BLOCK_RESCUE, /* a case when statements */ PM_CONTEXT_CASE_WHEN, /* a case in statements */ PM_CONTEXT_CASE_IN, /* a class declaration */ PM_CONTEXT_CLASS, /* an ensure statement within a class statement */ PM_CONTEXT_CLASS_ENSURE, /* a rescue else statement within a class statement */ PM_CONTEXT_CLASS_ELSE, /* a rescue statement within a class statement */ PM_CONTEXT_CLASS_RESCUE, /* a method definition */ PM_CONTEXT_DEF, /* an ensure statement within a method definition */ PM_CONTEXT_DEF_ENSURE, /* a rescue else statement within a method definition */ PM_CONTEXT_DEF_ELSE, /* a rescue statement within a method definition */ PM_CONTEXT_DEF_RESCUE, /* a method definition's parameters */ PM_CONTEXT_DEF_PARAMS, /* a defined? expression */ PM_CONTEXT_DEFINED, /* a method definition's default parameter */ PM_CONTEXT_DEFAULT_PARAMS, /* an else clause */ PM_CONTEXT_ELSE, /* an elsif clause */ PM_CONTEXT_ELSIF, /* an interpolated expression */ PM_CONTEXT_EMBEXPR, /* a for loop */ PM_CONTEXT_FOR, /* a for loop's index */ PM_CONTEXT_FOR_INDEX, /* an if statement */ PM_CONTEXT_IF, /* a lambda expression with braces */ PM_CONTEXT_LAMBDA_BRACES, /* a lambda expression with do..end */ PM_CONTEXT_LAMBDA_DO_END, /* an ensure statement within a lambda expression */ PM_CONTEXT_LAMBDA_ENSURE, /* a rescue else statement within a lambda expression */ PM_CONTEXT_LAMBDA_ELSE, /* a rescue statement within a lambda expression */ PM_CONTEXT_LAMBDA_RESCUE, /* the predicate clause of a loop statement */ PM_CONTEXT_LOOP_PREDICATE, /* the top level context */ PM_CONTEXT_MAIN, /* a module declaration */ PM_CONTEXT_MODULE, /* an ensure statement within a module statement */ PM_CONTEXT_MODULE_ENSURE, /* a rescue else statement within a module statement */ PM_CONTEXT_MODULE_ELSE, /* a rescue statement within a module statement */ PM_CONTEXT_MODULE_RESCUE, /* a multiple target expression */ PM_CONTEXT_MULTI_TARGET, /* a parenthesized expression */ PM_CONTEXT_PARENS, /* an END block */ PM_CONTEXT_POSTEXE, /* a predicate inside an if/elsif/unless statement */ PM_CONTEXT_PREDICATE, /* a BEGIN block */ PM_CONTEXT_PREEXE, /* a modifier rescue clause */ PM_CONTEXT_RESCUE_MODIFIER, /* a singleton class definition */ PM_CONTEXT_SCLASS, /* an ensure statement with a singleton class */ PM_CONTEXT_SCLASS_ENSURE, /* a rescue else statement with a singleton class */ PM_CONTEXT_SCLASS_ELSE, /* a rescue statement with a singleton class */ PM_CONTEXT_SCLASS_RESCUE, /* a ternary expression */ PM_CONTEXT_TERNARY, /* an unless statement */ PM_CONTEXT_UNLESS, /* an until statement */ PM_CONTEXT_UNTIL, /* a while statement */ PM_CONTEXT_WHILE, } pm_context_t; /* This is a node in a linked list of contexts. */ typedef struct pm_context_node { /* The context that this node represents. */ pm_context_t context; /* A pointer to the previous context in the linked list. */ struct pm_context_node *prev; } pm_context_node_t; /* The type of shareable constant value that can be set. */ typedef uint8_t pm_shareable_constant_value_t; static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_NONE = 0x0; static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_LITERAL = PM_SHAREABLE_CONSTANT_NODE_FLAGS_LITERAL; static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_EVERYTHING = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_EVERYTHING; static const pm_shareable_constant_value_t PM_SCOPE_SHAREABLE_CONSTANT_EXPERIMENTAL_COPY = PM_SHAREABLE_CONSTANT_NODE_FLAGS_EXPERIMENTAL_COPY; /* * This tracks an individual local variable in a certain lexical context, as * well as the number of times is it read. */ typedef struct { /* The name of the local variable. */ pm_constant_id_t name; /* The location of the local variable in the source. */ pm_location_t location; /* The index of the local variable in the local table. */ uint32_t index; /* The number of times the local variable is read. */ uint32_t reads; /* The hash of the local variable. */ uint32_t hash; } pm_local_t; /* * This is a set of local variables in a certain lexical context (method, class, * module, etc.). We need to track how many times these variables are read in * order to warn if they only get written. */ typedef struct pm_locals { /* The number of local variables in the set. */ uint32_t size; /* The capacity of the local variables set. */ uint32_t capacity; /* * A bloom filter over constant IDs stored in this set. Used to quickly * reject lookups for names that are definitely not present, avoiding the * cost of a linear scan or hash probe. */ uint32_t bloom; /* The nullable allocated memory for the local variables in the set. */ pm_local_t *locals; } pm_locals_t; /* The flags about scope parameters that can be set. */ typedef uint8_t pm_scope_parameters_t; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NONE = 0x0; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_POSITIONALS = 0x1; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_KEYWORDS = 0x2; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_BLOCK = 0x4; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_FORWARDING_ALL = 0x8; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_IMPLICIT_DISALLOWED = 0x10; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_INNER = 0x20; static const pm_scope_parameters_t PM_SCOPE_PARAMETERS_NUMBERED_FOUND = 0x40; /* * This struct represents a node in a linked list of scopes. Some scopes can see * into their parent scopes, while others cannot. */ typedef struct pm_scope { /* A pointer to the previous scope in the linked list. */ struct pm_scope *previous; /* The IDs of the locals in the given scope. */ pm_locals_t locals; /* * This is a list of the implicit parameters contained within the block. * These will be processed after the block is parsed to determine the kind * of parameters node that should be used and to check if any errors need to * be added. */ pm_node_list_t implicit_parameters; /* * This is a bitfield that indicates the parameters that are being used in * this scope. It is a combination of the PM_SCOPE_PARAMETERS_* constants. * There are three different kinds of parameters that can be used in a * scope: * * - Ordinary parameters (e.g., def foo(bar); end) * - Numbered parameters (e.g., def foo; _1; end) * - The it parameter (e.g., def foo; it; end) * * If ordinary parameters are being used, then certain parameters can be * forwarded to another method/structure. Those are indicated by four * additional bits in the params field. For example, some combinations of: * * - def foo(*); end * - def foo(**); end * - def foo(&); end * - def foo(...); end */ pm_scope_parameters_t parameters; /* * The current state of constant shareability for this scope. This is * changed by magic shareable_constant_value comments. */ pm_shareable_constant_value_t shareable_constant; /* * A boolean indicating whether or not this scope can see into its parent. * If closed is true, then the scope cannot see into its parent. */ bool closed; } pm_scope_t; /* * A struct that represents a stack of boolean values. */ typedef uint32_t pm_state_stack_t; /* * This struct represents the overall parser. It contains a reference to the * source file, as well as pointers that indicate where in the source it's * currently parsing. It also contains the most recent and current token that * it's considering. */ struct pm_parser_t { /* The arena used for all AST-lifetime allocations. Caller-owned. */ pm_arena_t *arena; /* The arena used for parser metadata (comments, diagnostics, etc.). */ pm_arena_t metadata_arena; /* * The next node identifier that will be assigned. This is a unique * identifier used to track nodes such that the syntax tree can be dropped * but the node can be found through another parse. */ uint32_t node_id; /* * A single-entry cache for pm_parser_constant_id_raw. Avoids redundant * constant pool lookups when the same token is resolved multiple times * (e.g., once during lexing for local variable detection, and again * during parsing for node creation). */ struct { const uint8_t *start; const uint8_t *end; pm_constant_id_t id; } constant_cache; /* The current state of the lexer. */ pm_lex_state_t lex_state; /* Tracks the current nesting of (), [], and {}. */ int enclosure_nesting; /* * Used to temporarily track the nesting of enclosures to determine if a { * is the beginning of a lambda following the parameters of a lambda. */ int lambda_enclosure_nesting; /* * Used to track the nesting of braces to ensure we get the correct value * when we are interpolating blocks with braces. */ int brace_nesting; /* * The stack used to determine if a do keyword belongs to the predicate of a * while, until, or for loop. */ pm_state_stack_t do_loop_stack; /* * The stack used to determine if a do keyword belongs to the beginning of a * block. */ pm_state_stack_t accepts_block_stack; /* A stack of lex modes. */ struct { /* The current mode of the lexer. */ pm_lex_mode_t *current; /* The stack of lexer modes. */ pm_lex_mode_t stack[PM_LEX_STACK_SIZE]; /* The current index into the lexer mode stack. */ size_t index; } lex_modes; /* The pointer to the start of the source. */ const uint8_t *start; /* The pointer to the end of the source. */ const uint8_t *end; /* The previous token we were considering. */ pm_token_t previous; /* The current token we're considering. */ pm_token_t current; /* * This is a special field set on the parser when we need the parser to jump * to a specific location when lexing the next token, as opposed to just * using the end of the previous token. Normally this is NULL. */ const uint8_t *next_start; /* * This field indicates the end of a heredoc whose identifier was found on * the current line. If another heredoc is found on the same line, then this * will be moved forward to the end of that heredoc. If no heredocs are * found on a line then this is NULL. */ const uint8_t *heredoc_end; /* The list of comments that have been found while parsing. */ pm_list_t comment_list; /* The list of magic comments that have been found while parsing. */ pm_list_t magic_comment_list; /* * An optional location that represents the location of the __END__ marker * and the rest of the content of the file. This content is loaded into the * DATA constant when the file being parsed is the main file being executed. */ pm_location_t data_loc; /* The list of warnings that have been found while parsing. */ pm_list_t warning_list; /* The list of errors that have been found while parsing. */ pm_list_t error_list; /* The current local scope. */ pm_scope_t *current_scope; /* The current parsing context. */ pm_context_node_t *current_context; /* * The hash keys for the hash that is currently being parsed. This is not * usually necessary because it can pass it down the various call chains, * but in the event that you're parsing a hash that is being directly * pushed into another hash with **, we need to share the hash keys so that * we can warn for the nested hash as well. */ pm_static_literals_t *current_hash_keys; /* * The encoding functions for the current file is attached to the parser as * it's parsing so that it can change with a magic comment. */ const pm_encoding_t *encoding; /* * When the encoding that is being used to parse the source is changed by * prism, we provide the ability here to call out to a user-defined * function. */ pm_encoding_changed_callback_t encoding_changed_callback; /* * This pointer indicates where a comment must start if it is to be * considered an encoding comment. */ const uint8_t *encoding_comment_start; /* * When you are lexing through a file, the lexer needs all of the information * that the parser additionally provides (for example, the local table). So if * you want to properly lex Ruby, you need to actually lex it in the context of * the parser. In order to provide this functionality, we optionally allow a * struct to be attached to the parser that calls back out to a user-provided * callback when each token is lexed. */ struct { /* * This is the callback that is called when a token is lexed. It is * passed the opaque data pointer, the parser, and the token that was * lexed. */ pm_lex_callback_t callback; /* * This opaque pointer is used to provide whatever information the user * deemed necessary to the callback. In our case we use it to pass the * array that the tokens get appended into. */ void *data; } lex_callback; /* * This is the path of the file being parsed. We use the filepath when * constructing SourceFileNodes. */ pm_string_t filepath; /* * This constant pool keeps all of the constants defined throughout the file * so that we can reference them later. */ pm_constant_pool_t constant_pool; /* This is the list of line offsets in the source file. */ pm_line_offset_list_t line_offsets; /* * State communicated from the lexer to the parser for integer tokens. */ struct { /* * A flag indicating the base of the integer (binary, octal, decimal, * hexadecimal). Set during lexing and read during node creation. */ pm_node_flags_t base; /* * When lexing a decimal integer that fits in a uint32_t, we compute * the value during lexing to avoid re-scanning the digits during * parsing. If lexed is true, this holds the result and * pm_integer_parse can be skipped. */ uint32_t value; /* Whether value holds a valid pre-computed integer. */ bool lexed; } integer; /* * This string is used to pass information from the lexer to the parser. It * is particularly necessary because of escape sequences. */ pm_string_t current_string; /* * The line number at the start of the parse. This will be used to offset * the line numbers of all of the locations. */ int32_t start_line; /* * When a string-like expression is being lexed, any byte or escape sequence * that resolves to a value whose top bit is set (i.e., >= 0x80) will * explicitly set the encoding to the same encoding as the source. * Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that * resolves to a value whose top bit is set, then the encoding will be * explicitly set to UTF-8. * * The _next_ time this happens, if the encoding that is about to become the * explicitly set encoding does not match the previously set explicit * encoding, a mixed encoding error will be emitted. * * When the expression is finished being lexed, the explicit encoding * controls the encoding of the expression. For the most part this means * that the expression will either be encoded in the source encoding or * UTF-8. This holds for all encodings except US-ASCII. If the source is * US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the * expression will be encoded as ASCII-8BIT. * * Note that if the expression is a list, different elements within the same * list can have different encodings, so this will get reset between each * element. Furthermore all of this only applies to lists that support * interpolation, because otherwise escapes that could change the encoding * are ignored. * * At first glance, it may make more sense for this to live on the lexer * mode, but we need it here to communicate back to the parser for character * literals that do not push a new lexer mode. */ const pm_encoding_t *explicit_encoding; /* * When parsing block exits (e.g., break, next, redo), we need to validate * that they are in correct contexts. For the most part we can do this by * looking at our parent contexts. However, modifier while and until * expressions can change that context to make block exits valid. In these * cases, we need to keep track of the block exits and then validate them * after the expression has been parsed. * * We use a pointer here because we don't want to keep a whole list attached * since this will only be used in the context of begin/end expressions. */ pm_node_list_t *current_block_exits; /* The version of prism that we should use to parse. */ pm_options_version_t version; /* The command line flags given from the options. */ uint8_t command_line; /* * Whether or not we have found a frozen_string_literal magic comment with * a true or false value. * May be: * - PM_OPTIONS_FROZEN_STRING_LITERAL_DISABLED * - PM_OPTIONS_FROZEN_STRING_LITERAL_ENABLED * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET */ int8_t frozen_string_literal; /* * Whether or not we are parsing an eval string. This impacts whether or not * we should evaluate if block exits/yields are valid. */ bool parsing_eval; /* * Whether or not we are parsing a "partial" script, which is a script that * will be evaluated in the context of another script, so we should not * check jumps (next/break/etc.) for validity. */ bool partial_script; /* Whether or not we're at the beginning of a command. */ bool command_start; /* * Whether or not we're currently parsing the body of an endless method * definition. In this context, PM_TOKEN_KEYWORD_DO_BLOCK should not be * consumed by commands (it should bubble up to the outer context). */ bool in_endless_def_body; /* Whether or not we're currently recovering from a syntax error. */ bool recovering; /* * Whether or not the source being parsed could become valid if more input * were appended. This is set to false when the parser encounters a token * that is definitively wrong (e.g., a stray `end` or `]`) as opposed to * merely incomplete. */ bool continuable; /* * This is very specialized behavior for when you want to parse in a context * that does not respect encoding comments. Its main use case is translating * into the whitequark/parser AST which re-encodes source files in UTF-8 * before they are parsed and ignores encoding comments. */ bool encoding_locked; /* * Whether or not the encoding has been changed by a magic comment. We use * this to provide a fast path for the lexer instead of going through the * function pointer. */ bool encoding_changed; /* * This flag indicates that we are currently parsing a pattern matching * expression and impacts that calculation of newlines. */ bool pattern_matching_newlines; /* This flag indicates that we are currently parsing a keyword argument. */ bool in_keyword_arg; /* * Whether or not the parser has seen a token that has semantic meaning * (i.e., a token that is not a comment or whitespace). */ bool semantic_token_seen; /* * By default, Ruby always warns about mismatched indentation. This can be * toggled with a magic comment. */ bool warn_mismatched_indentation; #if defined(PRISM_HAS_NEON) || defined(PRISM_HAS_SSSE3) || defined(PRISM_HAS_SWAR) /* * Cached lookup tables for pm_strpbrk's SIMD fast path. Avoids rebuilding * the nibble-based tables on every call when the charset hasn't changed * (which is the common case during string/regex/list lexing). */ struct { /* The cached charset (null-terminated, max 11 chars + NUL). */ uint8_t charset[12]; /* Nibble-based low lookup table for SIMD matching. */ uint8_t low_lut[16]; /* Nibble-based high lookup table for SIMD matching. */ uint8_t high_lut[16]; /* Scalar fallback table (4 x 64-bit bitmasks covering all ASCII). */ uint64_t table[4]; } strpbrk_cache; #endif }; /* * Initialize a parser with the given start and end pointers. */ void pm_parser_init(pm_arena_t *arena, pm_parser_t *parser, const uint8_t *source, size_t size, const pm_options_t *options); /* * Free the memory held by the given parser. * * This does not free the `pm_options_t` object that was used to initialize the * parser. */ void pm_parser_cleanup(pm_parser_t *parser); #endif